diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp deleted file mode 100644 index 56e275ce707b6..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ /dev/null @@ -1,744 +0,0 @@ -//===- AMDGPUSplitModule.cpp ----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file Implements a module splitting algorithm designed to support the -/// FullLTO --lto-partitions option for parallel codegen. This is completely -/// different from the common SplitModule pass, as this system is designed with -/// AMDGPU in mind. -/// -/// The basic idea of this module splitting implementation is the same as -/// SplitModule: load-balance the module's functions across a set of N -/// partitions to allow parallel codegen. However, it does it very -/// differently than the target-agnostic variant: -/// - Kernels are used as the module's "roots". -/// They're known entry points on AMDGPU, and everything else is often -/// internal only. -/// - Each kernel has a set of dependencies, and when a kernel and its -/// dependencies is considered "big", we try to put it in a partition where -/// most dependencies are already imported, to avoid duplicating large -/// amounts of code. -/// - There's special care for indirect calls in order to ensure -/// AMDGPUResourceUsageAnalysis can work correctly. -/// -/// This file also includes a more elaborate logging system to enable -/// users to easily generate logs that (if desired) do not include any value -/// names, in order to not leak information about the source file. -/// Such logs are very helpful to understand and fix potential issues with -/// module splitting. - -#include "AMDGPUSplitModule.h" -#include "AMDGPUTargetMachine.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/User.h" -#include "llvm/IR/Value.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/Process.h" -#include "llvm/Support/SHA256.h" -#include "llvm/Support/Threading.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include -#include -#include -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-split-module" - -namespace { - -static cl::opt LargeKernelFactor( - "amdgpu-module-splitting-large-kernel-threshold", cl::init(2.0f), - cl::Hidden, - cl::desc( - "consider a kernel as large and needing special treatment when it " - "exceeds the average cost of a partition by this factor; e;g. 2.0 " - "means if the kernel and its dependencies is 2 times bigger than " - "an average partition; 0 disables large kernels handling entirely")); - -static cl::opt LargeKernelOverlapForMerge( - "amdgpu-module-splitting-large-kernel-merge-overlap", cl::init(0.8f), - cl::Hidden, - cl::desc("defines how much overlap between two large kernel's dependencies " - "is needed to put them in the same partition")); - -static cl::opt NoExternalizeGlobals( - "amdgpu-module-splitting-no-externalize-globals", cl::Hidden, - cl::desc("disables externalization of global variable with local linkage; " - "may cause globals to be duplicated which increases binary size")); - -static cl::opt - LogDirOpt("amdgpu-module-splitting-log-dir", cl::Hidden, - cl::desc("output directory for AMDGPU module splitting logs")); - -static cl::opt - LogPrivate("amdgpu-module-splitting-log-private", cl::Hidden, - cl::desc("hash value names before printing them in the AMDGPU " - "module splitting logs")); - -using CostType = InstructionCost::CostType; -using PartitionID = unsigned; - -static bool isEntryPoint(const Function *F) { - return AMDGPU::isEntryFunctionCC(F->getCallingConv()); -} - -static std::string getName(const Value &V) { - static bool HideNames; - - static llvm::once_flag HideNameInitFlag; - llvm::call_once(HideNameInitFlag, [&]() { - if (LogPrivate.getNumOccurrences()) - HideNames = LogPrivate; - else { - const auto EV = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_PRIVATE"); - HideNames = (EV.value_or("0") != "0"); - } - }); - - if (!HideNames) - return V.getName().str(); - return toHex(SHA256::hash(arrayRefFromStringRef(V.getName())), - /*LowerCase=*/true); -} - -/// Main logging helper. -/// -/// Logging can be configured by the following environment variable. -/// AMD_SPLIT_MODULE_LOG_DIR= -/// If set, uses as the directory to write logfiles to -/// each time module splitting is used. -/// AMD_SPLIT_MODULE_LOG_PRIVATE -/// If set to anything other than zero, all names are hidden. -/// -/// Both environment variables have corresponding CL options which -/// takes priority over them. -/// -/// Any output printed to the log files is also printed to dbgs() when -debug is -/// used and LLVM_DEBUG is defined. -/// -/// This approach has a small disadvantage over LLVM_DEBUG though: logging logic -/// cannot be removed from the code (by building without debug). This probably -/// has a small performance cost because if some computation/formatting is -/// needed for logging purpose, it may be done everytime only to be ignored -/// by the logger. -/// -/// As this pass only runs once and is not doing anything computationally -/// expensive, this is likely a reasonable trade-off. -/// -/// If some computation should really be avoided when unused, users of the class -/// can check whether any logging will occur by using the bool operator. -/// -/// \code -/// if (SML) { -/// // Executes only if logging to a file or if -debug is available and -/// used. -/// } -/// \endcode -class SplitModuleLogger { -public: - SplitModuleLogger(const Module &M) { - std::string LogDir = LogDirOpt; - if (LogDir.empty()) - LogDir = sys::Process::GetEnv("AMD_SPLIT_MODULE_LOG_DIR").value_or(""); - - // No log dir specified means we don't need to log to a file. - // We may still log to dbgs(), though. - if (LogDir.empty()) - return; - - // If a log directory is specified, create a new file with a unique name in - // that directory. - int Fd; - SmallString<0> PathTemplate; - SmallString<0> RealPath; - sys::path::append(PathTemplate, LogDir, "Module-%%-%%-%%-%%-%%-%%-%%.txt"); - if (auto Err = - sys::fs::createUniqueFile(PathTemplate.str(), Fd, RealPath)) { - report_fatal_error("Failed to create log file at '" + Twine(LogDir) + - "': " + Err.message(), - /*CrashDiag=*/false); - } - - FileOS = std::make_unique(Fd, /*shouldClose=*/true); - } - - bool hasLogFile() const { return FileOS != nullptr; } - - raw_ostream &logfile() { - assert(FileOS && "no logfile!"); - return *FileOS; - } - - /// \returns true if this SML will log anything either to a file or dbgs(). - /// Can be used to avoid expensive computations that are ignored when logging - /// is disabled. - operator bool() const { - return hasLogFile() || (DebugFlag && isCurrentDebugType(DEBUG_TYPE)); - } - -private: - std::unique_ptr FileOS; -}; - -template -static SplitModuleLogger &operator<<(SplitModuleLogger &SML, const Ty &Val) { - static_assert( - !std::is_same_v, - "do not print values to logs directly, use handleName instead!"); - LLVM_DEBUG(dbgs() << Val); - if (SML.hasLogFile()) - SML.logfile() << Val; - return SML; -} - -/// Calculate the cost of each function in \p M -/// \param SML Log Helper -/// \param TM TargetMachine instance used to retrieve TargetTransformInfo. -/// \param M Module to analyze. -/// \param CostMap[out] Resulting Function -> Cost map. -/// \return The module's total cost. -static CostType -calculateFunctionCosts(SplitModuleLogger &SML, const AMDGPUTargetMachine &TM, - Module &M, - DenseMap &CostMap) { - CostType ModuleCost = 0; - CostType KernelCost = 0; - - for (auto &Fn : M) { - if (Fn.isDeclaration()) - continue; - - CostType FnCost = 0; - TargetTransformInfo TTI = TM.getTargetTransformInfo(Fn); - - for (const auto &BB : Fn) { - for (const auto &I : BB) { - auto Cost = - TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize); - assert(Cost != InstructionCost::getMax()); - // Assume expensive if we can't tell the cost of an instruction. - CostType CostVal = - Cost.getValue().value_or(TargetTransformInfo::TCC_Expensive); - assert((FnCost + CostVal) >= FnCost && "Overflow!"); - FnCost += CostVal; - } - } - - assert(FnCost != 0); - - CostMap[&Fn] = FnCost; - assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!"); - ModuleCost += FnCost; - - if (isEntryPoint(&Fn)) - KernelCost += FnCost; - } - - CostType FnCost = (ModuleCost - KernelCost); - SML << "=> Total Module Cost: " << ModuleCost << '\n' - << " => KernelCost: " << KernelCost << " (" - << format("%0.2f", (float(KernelCost) / ModuleCost) * 100) << "%)\n" - << " => FnsCost: " << FnCost << " (" - << format("%0.2f", (float(FnCost) / ModuleCost) * 100) << "%)\n"; - - return ModuleCost; -} - -static bool canBeIndirectlyCalled(const Function &F) { - if (F.isDeclaration() || isEntryPoint(&F)) - return false; - return !F.hasLocalLinkage() || - F.hasAddressTaken(/*PutOffender=*/nullptr, - /*IgnoreCallbackUses=*/false, - /*IgnoreAssumeLikeCalls=*/true, - /*IgnoreLLVMUsed=*/true, - /*IgnoreARCAttachedCall=*/false, - /*IgnoreCastedDirectCall=*/true); -} - -/// When a kernel or any of its callees performs an indirect call, this function -/// takes over \ref addAllDependencies and adds all potentially callable -/// functions to \p Fns so they can be counted as dependencies of the kernel. -/// -/// This is needed due to how AMDGPUResourceUsageAnalysis operates: in the -/// presence of an indirect call, the function's resource usage is the same as -/// the most expensive function in the module. -/// \param M The module. -/// \param Fns[out] Resulting list of functions. -static void addAllIndirectCallDependencies(const Module &M, - DenseSet &Fns) { - for (const auto &Fn : M) { - if (canBeIndirectlyCalled(Fn)) - Fns.insert(&Fn); - } -} - -/// Adds the functions that \p Fn may call to \p Fns, then recurses into each -/// callee until all reachable functions have been gathered. -/// -/// \param SML Log Helper -/// \param CG Call graph for \p Fn's module. -/// \param Fn Current function to look at. -/// \param Fns[out] Resulting list of functions. -/// \param HadIndirectCall[out] Set to true if an indirect call was seen at some -/// point, either in \p Fn or in one of the function it calls. When that -/// happens, we fall back to adding all callable functions inside \p Fn's module -/// to \p Fns. -static void addAllDependencies(SplitModuleLogger &SML, const CallGraph &CG, - const Function &Fn, - DenseSet &Fns, - bool &HadIndirectCall) { - assert(!Fn.isDeclaration()); - - const Module &M = *Fn.getParent(); - SmallVector WorkList({&Fn}); - while (!WorkList.empty()) { - const auto &CurFn = *WorkList.pop_back_val(); - assert(!CurFn.isDeclaration()); - - // Scan for an indirect call. If such a call is found, we have to - // conservatively assume this can call all non-entrypoint functions in the - // module. - - for (auto &CGEntry : *CG[&CurFn]) { - auto *CGNode = CGEntry.second; - auto *Callee = CGNode->getFunction(); - if (!Callee) { - // Functions have an edge towards CallsExternalNode if they're external - // declarations, or if they do an indirect call. As we only process - // definitions here, we know this means the function has an indirect - // call. We then have to conservatively assume this can call all - // non-entrypoint functions in the module. - if (CGNode != CG.getCallsExternalNode()) - continue; // this is another function-less node we don't care about. - - SML << "Indirect call detected in " << getName(CurFn) - << " - treating all non-entrypoint functions as " - "potential dependencies\n"; - - // TODO: Print an ORE as well ? - addAllIndirectCallDependencies(M, Fns); - HadIndirectCall = true; - return; - } - - if (Callee->isDeclaration()) - continue; - - auto [It, Inserted] = Fns.insert(Callee); - if (Inserted) - WorkList.push_back(Callee); - } - } -} - -/// Contains information about a kernel and its dependencies. -struct KernelWithDependencies { - KernelWithDependencies(SplitModuleLogger &SML, CallGraph &CG, - const DenseMap &FnCosts, - const Function *Fn) - : Fn(Fn) { - addAllDependencies(SML, CG, *Fn, Dependencies, HasIndirectCall); - TotalCost = FnCosts.at(Fn); - for (const auto *Dep : Dependencies) { - TotalCost += FnCosts.at(Dep); - - // We cannot duplicate functions with external linkage, or functions that - // may be overriden at runtime. - HasNonDuplicatableDependecy |= - (Dep->hasExternalLinkage() || !Dep->isDefinitionExact()); - } - } - - const Function *Fn = nullptr; - DenseSet Dependencies; - /// Whether \p Fn or any of its \ref Dependencies contains an indirect call. - bool HasIndirectCall = false; - /// Whether any of \p Fn's dependencies cannot be duplicated. - bool HasNonDuplicatableDependecy = false; - - CostType TotalCost = 0; - - /// \returns true if this kernel and its dependencies can be considered large - /// according to \p Threshold. - bool isLarge(CostType Threshold) const { - return TotalCost > Threshold && !Dependencies.empty(); - } -}; - -/// Calculates how much overlap there is between \p A and \p B. -/// \return A number between 0.0 and 1.0, where 1.0 means A == B and 0.0 means A -/// and B have no shared elements. Kernels do not count in overlap calculation. -static float calculateOverlap(const DenseSet &A, - const DenseSet &B) { - DenseSet Total; - for (const auto *F : A) { - if (!isEntryPoint(F)) - Total.insert(F); - } - - if (Total.empty()) - return 0.0f; - - unsigned NumCommon = 0; - for (const auto *F : B) { - if (isEntryPoint(F)) - continue; - - auto [It, Inserted] = Total.insert(F); - if (!Inserted) - ++NumCommon; - } - - return static_cast(NumCommon) / Total.size(); -} - -/// Performs all of the partitioning work on \p M. -/// \param SML Log Helper -/// \param M Module to partition. -/// \param NumParts Number of partitions to create. -/// \param ModuleCost Total cost of all functions in \p M. -/// \param FnCosts Map of Function -> Cost -/// \param WorkList Kernels and their dependencies to process in order. -/// \returns The created partitions (a vector of size \p NumParts ) -static std::vector> -doPartitioning(SplitModuleLogger &SML, Module &M, unsigned NumParts, - CostType ModuleCost, - const DenseMap &FnCosts, - const SmallVector &WorkList) { - - SML << "\n--Partitioning Starts--\n"; - - // Calculate a "large kernel threshold". When more than one kernel's total - // import cost exceeds this value, we will try to merge it with other, - // similarly large kernels. - // - // e.g. let two kernels X and Y have a import cost of ~10% of the module, we - // assign X to a partition as usual, but when we get to Y, we check if it's - // worth also putting it in Y's partition. - const CostType LargeKernelThreshold = - LargeKernelFactor ? ((ModuleCost / NumParts) * LargeKernelFactor) - : std::numeric_limits::max(); - - std::vector> Partitions; - Partitions.resize(NumParts); - - // Assign a partition to each kernel, and try to keep the partitions more or - // less balanced. We do that through a priority queue sorted in reverse, so we - // can always look at the partition with the least content. - // - // There are some cases where we will be deliberately unbalanced though. - // - Large kernels: we try to merge with existing partitions to reduce code - // duplication. - // - Kernels with indirect or external calls always go in the first partition - // (P0). - auto ComparePartitions = [](const std::pair &a, - const std::pair &b) { - // When two partitions have the same cost, assign to the one with the - // biggest ID first. This allows us to put things in P0 last, because P0 may - // have other stuff added later. - if (a.second == b.second) - return a.first < b.first; - return a.second > b.second; - }; - - // We can't use priority_queue here because we need to be able to access any - // element. This makes this a bit inefficient as we need to sort it again - // everytime we change it, but it's a very small array anyway (likely under 64 - // partitions) so it's a cheap operation. - std::vector> BalancingQueue; - for (unsigned I = 0; I < NumParts; ++I) - BalancingQueue.push_back(std::make_pair(I, 0)); - - // Helper function to handle assigning a kernel to a partition. This takes - // care of updating the balancing queue. - const auto AssignToPartition = [&](PartitionID PID, - const KernelWithDependencies &KWD) { - auto &FnsInPart = Partitions[PID]; - FnsInPart.insert(KWD.Fn); - FnsInPart.insert(KWD.Dependencies.begin(), KWD.Dependencies.end()); - - SML << "assign " << getName(*KWD.Fn) << " to P" << PID << "\n -> "; - if (!KWD.Dependencies.empty()) { - SML << KWD.Dependencies.size() << " dependencies added\n"; - }; - - // Update the balancing queue. we scan backwards because in the common case - // the partition is at the end. - for (auto &[QueuePID, Cost] : reverse(BalancingQueue)) { - if (QueuePID == PID) { - CostType NewCost = 0; - for (auto *Fn : Partitions[PID]) - NewCost += FnCosts.at(Fn); - - SML << "[Updating P" << PID << " Cost]:" << Cost << " -> " << NewCost; - if (Cost) { - SML << " (" << unsigned(((float(NewCost) / Cost) - 1) * 100) - << "% increase)"; - } - SML << '\n'; - - Cost = NewCost; - } - } - - sort(BalancingQueue, ComparePartitions); - }; - - for (auto &CurKernel : WorkList) { - // When a kernel has indirect calls, it must stay in the first partition - // alongside every reachable non-entry function. This is a nightmare case - // for splitting as it severely limits what we can do. - if (CurKernel.HasIndirectCall) { - SML << "Kernel with indirect call(s): " << getName(*CurKernel.Fn) - << " defaulting to P0\n"; - AssignToPartition(0, CurKernel); - continue; - } - - // When a kernel has non duplicatable dependencies, we have to keep it in - // the first partition as well. This is a conservative approach, a - // finer-grained approach could keep track of which dependencies are - // non-duplicatable exactly and just make sure they're grouped together. - if (CurKernel.HasNonDuplicatableDependecy) { - SML << "Kernel with externally visible dependency " - << getName(*CurKernel.Fn) << " defaulting to P0\n"; - AssignToPartition(0, CurKernel); - continue; - } - - // Be smart with large kernels to avoid duplicating their dependencies. - if (CurKernel.isLarge(LargeKernelThreshold)) { - assert(LargeKernelOverlapForMerge >= 0.0f && - LargeKernelOverlapForMerge <= 1.0f); - SML << "Large Kernel: " << getName(*CurKernel.Fn) - << " - looking for partition with at least " - << format("%0.2f", LargeKernelOverlapForMerge * 100) << "% overlap\n"; - - bool Assigned = false; - for (const auto &[PID, Fns] : enumerate(Partitions)) { - float Overlap = calculateOverlap(CurKernel.Dependencies, Fns); - SML << " => " << format("%0.2f", Overlap * 100) << "% overlap with P" - << PID << '\n'; - if (Overlap > LargeKernelOverlapForMerge) { - SML << " selecting P" << PID << '\n'; - AssignToPartition(PID, CurKernel); - Assigned = true; - } - } - - if (Assigned) - continue; - } - - // Normal "load-balancing", assign to partition with least pressure. - auto [PID, CurCost] = BalancingQueue.back(); - AssignToPartition(PID, CurKernel); - } - - // Work is mostly done now, verify the partioning and add all functions we may - // have missed (= unreachable, or we don't understand how they're reached) to - // P0. - DenseSet AllFunctions; - for (const auto &[Idx, Part] : enumerate(Partitions)) { - CostType Cost = 0; - for (auto *Fn : Part) { - // external linkage functions should exclusively be in the first partition - // at this stage. In theory, we should only ever see external linkage - // functions here if they're kernels, or if they've been added due to a - // kernel using indirect calls somewhere in its CallGraph. - assert(Idx == 0 || (!Fn->hasExternalLinkage() || isEntryPoint(Fn))); - Cost += FnCosts.at(Fn); - } - SML << "P" << Idx << " has a total cost of " << Cost << " (" - << format("%0.2f", (float(Cost) / ModuleCost) * 100) - << "% of source module)\n"; - AllFunctions.insert(Part.begin(), Part.end()); - } - - // Add missed functions to P0. This will take care of adding things like - // external functions with no callers in the module to P0. This should be - // fairly rare as AMDGPU internalizes everything in most cases, so unused - // internal functions would get removed. - for (auto &Fn : M) { - if (!Fn.isDeclaration() && !AllFunctions.contains(&Fn)) { - SML << getName(Fn) << " has no partition assigned, defaulting to P0\n"; - Partitions[0].insert(&Fn); - } - } - - SML << "--Partitioning Done--\n\n"; - - return Partitions; -} - -static void externalize(GlobalValue &GV) { - if (GV.hasLocalLinkage()) { - GV.setLinkage(GlobalValue::ExternalLinkage); - GV.setVisibility(GlobalValue::HiddenVisibility); - } - - // Unnamed entities must be named consistently between modules. setName will - // give a distinct name to each such entity. - if (!GV.hasName()) - GV.setName("__llvmsplit_unnamed"); -} -} // end anonymous namespace - -void llvm::splitAMDGPUModule( - const AMDGPUTargetMachine &TM, Module &M, unsigned N, - function_ref MPart)> ModuleCallback) { - - SplitModuleLogger SML(M); - - CallGraph CG(M); - - // Externalize functions whose address are taken. - // - // This is needed because partitioning is purely based on calls, but sometimes - // a kernel/function may just look at the address of another local function - // and not do anything (no calls). After partitioning, that local function may - // end up in a different module (so it's just a declaration in the module - // where its address is taken), which emits a "undefined hidden symbol" linker - // error. - // - // Additionally, it guides partitioning to not duplicate this function if it's - // called directly at some point. - for (auto &Fn : M) { - if (Fn.hasAddressTaken()) { - if (Fn.hasLocalLinkage()) { - SML << "[externalize] " << Fn.getName() - << " because its address is taken\n"; - } - externalize(Fn); - } - } - - // Externalize local GVs, which avoids duplicating their initializers, which - // in turns helps keep code size in check. - if (!NoExternalizeGlobals) { - for (auto &GV : M.globals()) { - if (GV.hasLocalLinkage()) - SML << "[externalize] GV " << GV.getName() << '\n'; - externalize(GV); - } - } - - // Start by calculating the cost of every function in the module, as well as - // the module's overall cost. - DenseMap FnCosts; - const CostType ModuleCost = calculateFunctionCosts(SML, TM, M, FnCosts); - - // Gather every kernel into a WorkList, then sort it by descending total cost - // of the kernel so the biggest kernels are seen first. - SmallVector WorkList; - for (auto &Fn : M) { - if (isEntryPoint(&Fn) && !Fn.isDeclaration()) - WorkList.emplace_back(SML, CG, FnCosts, &Fn); - } - sort(WorkList, [&](auto &A, auto &B) { - // Sort by total cost, and if the total cost is identical, sort - // alphabetically. - if (A.TotalCost == B.TotalCost) - return A.Fn->getName() < B.Fn->getName(); - return A.TotalCost > B.TotalCost; - }); - - if (SML) { - SML << "Worklist\n"; - for (const auto &KWD : WorkList) { - SML << "[Kernel] " << getName(*KWD.Fn) << " (totalCost:" << KWD.TotalCost - << " indirect:" << KWD.HasIndirectCall - << " hasNonDuplicatableDep:" << KWD.HasNonDuplicatableDependecy - << ")\n"; - for (const auto *Dep : KWD.Dependencies) - SML << " [Dep] " << getName(*Dep) << '\n'; - } - } - - // This performs all of the partitioning work. - auto Partitions = doPartitioning(SML, M, N, ModuleCost, FnCosts, WorkList); - assert(Partitions.size() == N); - - // If we didn't externalize GVs, then local GVs need to be conservatively - // imported into every module (including their initializers), and then cleaned - // up afterwards. - const auto NeedsConservativeImport = [&](const GlobalValue *GV) { - // We conservatively import private/internal GVs into every module and clean - // them up afterwards. - const auto *Var = dyn_cast(GV); - return Var && Var->hasLocalLinkage(); - }; - - SML << "Creating " << N << " modules...\n"; - unsigned TotalFnImpls = 0; - for (unsigned I = 0; I < N; ++I) { - const auto &FnsInPart = Partitions[I]; - - ValueToValueMapTy VMap; - std::unique_ptr MPart( - CloneModule(M, VMap, [&](const GlobalValue *GV) { - // Functions go in their assigned partition. - if (const auto *Fn = dyn_cast(GV)) { -// Check we don't import an external linkage function in any -// partition other than P0. -#ifndef NDEBUG - if (Fn->hasExternalLinkage() && !isEntryPoint(Fn)) { - assert((I == 0) == FnsInPart.contains(Fn)); - } -#endif - return FnsInPart.contains(Fn); - } - - if (NeedsConservativeImport(GV)) - return true; - - // Everything else goes in the first partition. - return I == 0; - })); - - // Clean-up conservatively imported GVs without any users. - for (auto &GV : make_early_inc_range(MPart->globals())) { - if (NeedsConservativeImport(&GV) && GV.use_empty()) - GV.eraseFromParent(); - } - - unsigned NumAllFns = 0, NumKernels = 0; - for (auto &Cur : *MPart) { - if (!Cur.isDeclaration()) { - ++NumAllFns; - if (isEntryPoint(&Cur)) - ++NumKernels; - } - } - TotalFnImpls += NumAllFns; - SML << " - Module " << I << " with " << NumAllFns << " functions (" - << NumKernels << " kernels)\n"; - ModuleCallback(std::move(MPart)); - } - - SML << TotalFnImpls << " function definitions across all modules (" - << format("%0.2f", (float(TotalFnImpls) / FnCosts.size()) * 100) - << "% of original module)\n"; -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h deleted file mode 100644 index 6171643bd4adc..0000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h +++ /dev/null @@ -1,30 +0,0 @@ -//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H -#define LLVM_TARGET_AMDGPUSPLITMODULE_H - -#include "llvm/ADT/STLFunctionalExtras.h" -#include - -namespace llvm { - -class Module; -class AMDGPUTargetMachine; - -/// Splits the module M into N linkable partitions. The function ModuleCallback -/// is called N times passing each individual partition as the MPart argument. -void splitAMDGPUModule( - const AMDGPUTargetMachine &TM, Module &M, unsigned N, - function_ref MPart)> ModuleCallback); - -} // end namespace llvm - -#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index dbbfe34a63863..20329dea60275 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -21,7 +21,6 @@ #include "AMDGPUIGroupLP.h" #include "AMDGPUMacroFusion.h" #include "AMDGPURegBankSelect.h" -#include "AMDGPUSplitModule.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUUnifyDivergentExitNodes.h" @@ -816,13 +815,6 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { return AMDGPUAS::FLAT_ADDRESS; } -bool AMDGPUTargetMachine::splitModule( - Module &M, unsigned NumParts, - function_ref MPart)> ModuleCallback) const { - splitAMDGPUModule(*this, M, NumParts, ModuleCallback); - return true; -} - //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 2cfd232483a8a..e48cb8fdc6576 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -73,10 +73,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { getPredicatedAddrSpace(const Value *V) const override; unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override; - - bool splitModule(Module &M, unsigned NumParts, - function_ref MPart)> - ModuleCallback) const override; }; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index c992352cb78da..ead81b402eb76 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -98,7 +98,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPURewriteOutArguments.cpp AMDGPURewriteUndefForPHI.cpp AMDGPUSetWavePriority.cpp - AMDGPUSplitModule.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp AMDGPUTargetObjectFile.cpp diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll deleted file mode 100644 index 8b76237efa325..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; 3 kernels: -; - A does a direct call to HelperA -; - B is storing @HelperA -; - C does a direct call to HelperA -; -; The helper functions will get externalized, which will force A and C into P0 as -; external functions cannot be duplicated. - -; CHECK0: define hidden void @HelperA() -; CHECK0: define amdgpu_kernel void @A() -; CHECK0: declare amdgpu_kernel void @B(ptr) -; CHECK0: define amdgpu_kernel void @C() - -; CHECK1: declare hidden void @HelperA() -; CHECK1: declare amdgpu_kernel void @A() -; CHECK1: declare amdgpu_kernel void @B(ptr) -; CHECK1: declare amdgpu_kernel void @C() - -; CHECK2: declare hidden void @HelperA() -; CHECK2: declare amdgpu_kernel void @A() -; CHECK2: define amdgpu_kernel void @B(ptr %dst) -; CHECK2: declare amdgpu_kernel void @C() - -define internal void @HelperA() { - ret void -} - -define amdgpu_kernel void @A() { - call void @HelperA() - ret void -} - -define amdgpu_kernel void @B(ptr %dst) { - store ptr @HelperA, ptr %dst - ret void -} - -define amdgpu_kernel void @C() { - call void @HelperA() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll deleted file mode 100644 index 46d7d9783aeae..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll +++ /dev/null @@ -1,37 +0,0 @@ -; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s - -; 2 kernels: -; - A is isolated -; - B is storing @HelperA/B's address -; -; The helper functions should get externalized (become hidden w/ external linkage) - -; CHECK0: define hidden void @HelperA() -; CHECK0: define hidden void @HelperB() -; CHECK0: define amdgpu_kernel void @A() -; CHECK0: declare amdgpu_kernel void @B(i1, ptr) - -; CHECK1: declare hidden void @HelperA() -; CHECK1: declare hidden void @HelperB() -; CHECK1: declare amdgpu_kernel void @A() -; CHECK1: define amdgpu_kernel void @B(i1 %cond, ptr %dst) - -define internal void @HelperA() { - ret void -} - -define internal void @HelperB() { - ret void -} - -define amdgpu_kernel void @A() { - ret void -} - -define amdgpu_kernel void @B(i1 %cond, ptr %dst) { - %addr = select i1 %cond, ptr @HelperA, ptr @HelperB - store ptr %addr, ptr %dst - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll b/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll deleted file mode 100644 index 6a07ed51ba1be..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel -; REQUIRES: asserts - -; SHA256 of the kernel names. - -; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c -; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59 -; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55 - -define amdgpu_kernel void @MyCustomKernel0() { - ret void -} - -define amdgpu_kernel void @MyCustomKernel1() { - ret void -} - -define amdgpu_kernel void @MyCustomKernel2() { - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll deleted file mode 100644 index c2746d1398924..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s - -; 3 kernels: -; - A calls nothing -; - B calls @PerryThePlatypus -; - C calls @Perry, an alias of @PerryThePlatypus -; -; We should see through the alias and put B/C in the same -; partition. -; -; Additionally, @PerryThePlatypus gets externalized as -; the alias counts as taking its address. - -; CHECK0-NOT: define -; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus -; CHECK0: define hidden void @PerryThePlatypus() -; CHECK0: define amdgpu_kernel void @B -; CHECK0: define amdgpu_kernel void @C -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define amdgpu_kernel void @A -; CHECK1-NOT: define - -@Perry = internal alias ptr(), ptr @PerryThePlatypus - -define internal void @PerryThePlatypus() { - ret void -} - -define amdgpu_kernel void @A() { - ret void -} - -define amdgpu_kernel void @B() { - call void @PerryThePlatypus() - ret void -} - -define amdgpu_kernel void @C() { - call void @Perry() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll deleted file mode 100644 index 4635264aefb39..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll +++ /dev/null @@ -1,54 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; 3 kernels with each their own dependencies should go into 3 -; distinct partitions. The most expensive kernel should be -; seen first and go into the last partition. - -; CHECK0-NOT: define -; CHECK0: define amdgpu_kernel void @C -; CHECK0: define internal void @HelperC -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define amdgpu_kernel void @A -; CHECK1: define internal void @HelperA -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define amdgpu_kernel void @B -; CHECK2: define internal void @HelperB -; CHECK2-NOT: define - - -define amdgpu_kernel void @A() { - call void @HelperA() - ret void -} - -define internal void @HelperA() { - ret void -} - -define amdgpu_kernel void @B(ptr %x) { - store i64 42, ptr %x - store i64 43, ptr %x - store i64 44, ptr %x - call void @HelperB() - ret void -} - -define internal void @HelperB() { - ret void -} - -define amdgpu_kernel void @C() { - call void @HelperC() - ret void -} - -define internal void @HelperC() { - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll deleted file mode 100644 index bea527f15bbaa..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; 3 kernels with each their own dependencies should go into 3 -; distinct partitions. - -; CHECK0-NOT: define -; CHECK0: define amdgpu_kernel void @C -; CHECK0: define internal void @HelperC -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define amdgpu_kernel void @B -; CHECK1: define internal void @HelperB -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define internal void @HelperA -; CHECK2-NOT: define - - -define amdgpu_kernel void @A() { - call void @HelperA() - ret void -} - -define internal void @HelperA() { - ret void -} - -define amdgpu_kernel void @B() { - call void @HelperB() - ret void -} - -define internal void @HelperB() { - ret void -} - -define amdgpu_kernel void @C() { - call void @HelperC() - ret void -} - -define internal void @HelperC() { - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll deleted file mode 100644 index 64839f8d8456a..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-duplication.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; 3 kernels share a common helper, that helper should be -; cloned in all partitions. - -; CHECK0-NOT: define -; CHECK0: define internal void @Helper -; CHECK0: define amdgpu_kernel void @C -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define internal void @Helper -; CHECK1: define amdgpu_kernel void @B -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define internal void @Helper -; CHECK2: define amdgpu_kernel void @A -; CHECK2-NOT: define - -define internal void @Helper() { - ret void -} - -define amdgpu_kernel void @A() { - call void @Helper() - ret void -} - -define amdgpu_kernel void @B() { - call void @Helper() - ret void -} - -define amdgpu_kernel void @C() { - call void @Helper() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll deleted file mode 100644 index 435e97a581340..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-external.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s -; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s - -; Both overridable helper should go in P0. - -; CHECK0-NOT: define -; CHECK0: define available_externally void @OverridableHelper0() -; CHECK0: define internal void @OverridableHelper1() -; CHECK0: define amdgpu_kernel void @A -; CHECK0: define amdgpu_kernel void @B -; CHECK0-NOT: define - -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define internal void @PrivateHelper1() -; CHECK2: define amdgpu_kernel void @D -; CHECK2-NOT: define - -; CHECK3-NOT: define -; CHECK3: define internal void @PrivateHelper0() -; CHECK3: define amdgpu_kernel void @C -; CHECK3-NOT: define - -define available_externally void @OverridableHelper0() { - ret void -} - -define internal void @OverridableHelper1() #0 { - ret void -} - -define internal void @PrivateHelper0() { - ret void -} - -define internal void @PrivateHelper1() { - ret void -} - -define amdgpu_kernel void @A() { - call void @OverridableHelper0() - ret void -} - -define amdgpu_kernel void @B() { - call void @OverridableHelper1() - ret void -} - -define amdgpu_kernel void @C() { - call void @PrivateHelper0() - ret void -} - -define amdgpu_kernel void @D() { - call void @PrivateHelper1() - ret void -} - -attributes #0 = { nobuiltin } diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll deleted file mode 100644 index 9701ac35ce54e..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; We have 4 kernels: -; - Each kernel has an internal helper -; - @A and @B's helpers does an indirect call. -; -; We default to putting A/B in P0, alongside a copy -; of all helpers who have their address taken. -; The other kernels can still go into separate partitions. - -; CHECK0-NOT: define -; CHECK0: define hidden void @HelperA -; CHECK0: define hidden void @HelperB -; CHECK0: define hidden void @CallCandidate -; CHECK0-NOT: define {{.*}} @HelperC -; CHECK0-NOT: define {{.*}} @HelperD -; CHECK0: define amdgpu_kernel void @A -; CHECK0: define amdgpu_kernel void @B -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define internal void @HelperD -; CHECK1: define amdgpu_kernel void @D -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define internal void @HelperC -; CHECK2: define amdgpu_kernel void @C -; CHECK2-NOT: define - -@addrthief = global [3 x ptr] [ptr @HelperA, ptr @HelperB, ptr @CallCandidate] - -define internal void @HelperA(ptr %call) { - call void %call() - ret void -} - -define internal void @HelperB(ptr %call) { - call void %call() - ret void -} - -define internal void @CallCandidate() { - ret void -} - -define internal void @HelperC() { - ret void -} - -define internal void @HelperD() { - ret void -} - -define amdgpu_kernel void @A(ptr %call) { - call void @HelperA(ptr %call) - ret void -} - -define amdgpu_kernel void @B(ptr %call) { - call void @HelperB(ptr %call) - ret void -} - -define amdgpu_kernel void @C() { - call void @HelperC() - ret void -} - -define amdgpu_kernel void @D() { - call void @HelperD() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll deleted file mode 100644 index dc2c5c3c07bee..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-overridable.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; CHECK0-NOT: define -; CHECK0: define void @ExternalHelper -; CHECK0: define amdgpu_kernel void @A -; CHECK0: define amdgpu_kernel void @B -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define amdgpu_kernel void @D -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define amdgpu_kernel void @C -; CHECK2-NOT: define - -define void @ExternalHelper() { - ret void -} - -define amdgpu_kernel void @A() { - call void @ExternalHelper() - ret void -} - -define amdgpu_kernel void @B() { - call void @ExternalHelper() - ret void -} - -define amdgpu_kernel void @C() { - ret void -} - -define amdgpu_kernel void @D() { - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll deleted file mode 100644 index 0fc76934afc54..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables-noexternal.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-globals -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; 3 kernels use private/internal global variables. -; The GVs should be copied in each partition as needed. - -; CHECK0-NOT: define -; CHECK0: @bar = internal constant ptr -; CHECK0: define amdgpu_kernel void @C -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: @foo = private constant ptr -; CHECK1: define amdgpu_kernel void @A -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: @foo = private constant ptr -; CHECK2: @bar = internal constant ptr -; CHECK2: define amdgpu_kernel void @B -; CHECK2-NOT: define - -@foo = private constant ptr poison -@bar = internal constant ptr poison - -define amdgpu_kernel void @A() { - store i32 42, ptr @foo - ret void -} - -define amdgpu_kernel void @B() { - store i32 42, ptr @foo - store i32 42, ptr @bar - ret void -} - -define amdgpu_kernel void @C() { - store i32 42, ptr @bar - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll deleted file mode 100644 index 7564662e7c7c0..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-global-variables.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; 3 kernels use private/internal global variables. -; The GVs should be copied in each partition as needed. - -; CHECK0-NOT: define -; CHECK0: @foo = hidden constant ptr poison -; CHECK0: @bar = hidden constant ptr poison -; CHECK0: define amdgpu_kernel void @C -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: @foo = external hidden constant ptr{{$}} -; CHECK1: @bar = external hidden constant ptr{{$}} -; CHECK1: define amdgpu_kernel void @A -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: @foo = external hidden constant ptr{{$}} -; CHECK2: @bar = external hidden constant ptr{{$}} -; CHECK2: define amdgpu_kernel void @B -; CHECK2-NOT: define - -@foo = private constant ptr poison -@bar = internal constant ptr poison - -define amdgpu_kernel void @A() { - store i32 42, ptr @foo - ret void -} - -define amdgpu_kernel void @B() { - store i32 42, ptr @foo - store i32 42, ptr @bar - ret void -} - -define amdgpu_kernel void @C() { - store i32 42, ptr @bar - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll deleted file mode 100644 index 5dfb95c5fc660..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-load-balancing.ll +++ /dev/null @@ -1,75 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; Test load balancing logic with 6 kernels. -; -; Kernels go from most expensive (A == 6) to least expensive (F == 1) -; -; Load balancing should work like this (current partition cost is in parens) -; -; Initial -> [P0(0), P1(0), P2(0)] -; -; A(6) goes in 2 -> [P2(6), P0(0), P1(0)] -; B(5) goes in 1 -> [P2(6), P1(5), P0(4)] -; C(4) goes in 0 -> [P2(6), P1(5), P0(4)] - -; D(3) goes in 0 -> [P0(7), P2(6), P1(5)] -; E(2) goes in 1 -> [P0(7), P1(7), P2(6)] -; F(1) goes in 2 -> [P0(7), P1(7), P2(7)] - -; CHECK0-NOT: define -; CHECK0: define amdgpu_kernel void @C -; CHECK0: define amdgpu_kernel void @D -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define amdgpu_kernel void @B -; CHECK1: define amdgpu_kernel void @E -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define amdgpu_kernel void @F -; CHECK2-NOT: define - - -define amdgpu_kernel void @A(ptr %x) { - store i64 42, ptr %x - store i64 43, ptr %x - store i64 44, ptr %x - store i64 45, ptr %x - store i64 46, ptr %x - ret void -} - -define amdgpu_kernel void @B(ptr %x) { - store i64 42, ptr %x - store i64 43, ptr %x - store i64 44, ptr %x - store i64 45, ptr %x - ret void -} - -define amdgpu_kernel void @C(ptr %x) { - store i64 42, ptr %x - store i64 43, ptr %x - store i64 44, ptr %x - ret void -} - -define amdgpu_kernel void @D(ptr %x) { - store i64 42, ptr %x - store i64 43, ptr %x - ret void -} - -define amdgpu_kernel void @E(ptr %x) { - store i64 42, ptr %x - ret void -} - -define amdgpu_kernel void @F() { - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll deleted file mode 100644 index 8959acfcae542..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-no-dependencies.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llvm-split -o %t %s -j 4 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s -; RUN: llvm-dis -o - %t3 | FileCheck --check-prefix=CHECK3 %s - -; Check that 4 independent kernels get put into 4 different partitions. - -; CHECK0-NOT: define -; CHECK0: define amdgpu_kernel void @D -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define amdgpu_kernel void @C -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define amdgpu_kernel void @B -; CHECK2-NOT: define - -; CHECK3-NOT: define -; CHECK3: define amdgpu_kernel void @A -; CHECK3-NOT: define - -define amdgpu_kernel void @A() { - ret void -} - -define amdgpu_kernel void @B() { - ret void -} - -define amdgpu_kernel void @C() { - ret void -} - -define amdgpu_kernel void @D() { - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll deleted file mode 100644 index 4fdbac7d17897..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll +++ /dev/null @@ -1,98 +0,0 @@ -; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=1.2 -amdgpu-module-splitting-large-kernel-merge-overlap=0.5 -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s - -; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0 -; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 %s -; RUN: llvm-dis -o - %t.nolarge1 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK1 %s -; RUN: llvm-dis -o - %t.nolarge2 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK2 %s - -; 2 kernels (A/B) are large and share all their dependencies. -; They should go in the same partition, the remaining kernel should -; go somewhere else, and one partition should be empty. -; -; Also check w/o large kernels processing to verify they are indeed handled -; differently. - -; CHECK0-NOT: define - -; CHECK1-NOT: define -; CHECK1: define internal void @HelperC() -; CHECK1: define amdgpu_kernel void @C -; CHECK1-NOT: define - -; CHECK2-NOT: define -; CHECK2: define internal void @large2() -; CHECK2: define internal void @large1() -; CHECK2: define internal void @large0() -; CHECK2: define internal void @HelperA() -; CHECK2: define internal void @HelperB() -; CHECK2: define amdgpu_kernel void @A -; CHECK2: define amdgpu_kernel void @B -; CHECK2-NOT: define - -; NOLARGEKERNELS-CHECK0-NOT: define -; NOLARGEKERNELS-CHECK0: define internal void @HelperC() -; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C -; NOLARGEKERNELS-CHECK0-NOT: define - -; NOLARGEKERNELS-CHECK1: define internal void @large2() -; NOLARGEKERNELS-CHECK1: define internal void @large1() -; NOLARGEKERNELS-CHECK1: define internal void @large0() -; NOLARGEKERNELS-CHECK1: define internal void @HelperB() -; NOLARGEKERNELS-CHECK1: define amdgpu_kernel void @B - -; NOLARGEKERNELS-CHECK2: define internal void @large2() -; NOLARGEKERNELS-CHECK2: define internal void @large1() -; NOLARGEKERNELS-CHECK2: define internal void @large0() -; NOLARGEKERNELS-CHECK2: define internal void @HelperA() -; NOLARGEKERNELS-CHECK2: define amdgpu_kernel void @A - -define internal void @large2() { - store volatile i32 42, ptr null - call void @large2() - ret void -} - -define internal void @large1() { - call void @large1() - call void @large2() - ret void -} - -define internal void @large0() { - call void @large0() - call void @large1() - call void @large2() - ret void -} - -define internal void @HelperA() { - call void @large0() - ret void -} - -define internal void @HelperB() { - call void @large0() - ret void -} - -define amdgpu_kernel void @A() { - call void @HelperA() - ret void -} - -define amdgpu_kernel void @B() { - call void @HelperB() - ret void -} - -define internal void @HelperC() { - ret void -} - -define amdgpu_kernel void @C() { - call void @HelperC() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg b/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg deleted file mode 100644 index 6154a6c1c9061..0000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not "AMDGPU" in config.root.targets: - config.unsupported = True