diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h
index 7f0c0c194dc91..eaa4ab52c243d 100644
--- a/compiler-rt/lib/profile/InstrProfiling.h
+++ b/compiler-rt/lib/profile/InstrProfiling.h
@@ -304,6 +304,22 @@ int __llvm_profile_get_padding_sizes_for_counters(
  */
 void __llvm_profile_set_dumped(void);
 
+/*!
+ * \brief Write custom target-specific profiling data to a separate file.
+ * Used by offload PGO.
+ *
+ * The file name is the current profile file name with "<Target>." prepended
+ * to its base name. Returns 0 when the write is skipped because the profile
+ * was already dumped or continuous mode is enabled, -1 on failure, and
+ * otherwise the result of the underlying profile writer.
+ */
+int __llvm_write_custom_profile(const char *Target,
+                                const __llvm_profile_data *DataBegin,
+                                const __llvm_profile_data *DataEnd,
+                                const char *CountersBegin,
+                                const char *CountersEnd, const char *NamesBegin,
+                                const char *NamesEnd);
+
 /*!
  * This variable is defined in InstrProfilingRuntime.cpp as a hidden
  * symbol. Its main purpose is to enable profile runtime user to
diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c
index 343063fd6b754..503d159fd9817 100644
--- a/compiler-rt/lib/profile/InstrProfilingFile.c
+++ b/compiler-rt/lib/profile/InstrProfilingFile.c
@@ -541,6 +541,20 @@ static FILE *getFileObject(const char *OutputName) {
   return fopen(OutputName, "ab");
 }
 
+/* Finish writing to \c OutputFile: the current profile file is only flushed
+ * (and unlocked when merging without continuous mode); any other file is
+ * closed. */
+static void closeFileObject(FILE *OutputFile) {
+  if (OutputFile == getProfileFile()) {
+    fflush(OutputFile);
+    if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) {
+      lprofUnlockFileHandle(OutputFile);
+    }
+  } else {
+    fclose(OutputFile);
+  }
+}
+
 /* Write profile data to file \c OutputName.  */
 static int writeFile(const char *OutputName) {
   int RetVal;
@@ -562,15 +576,7 @@ static int writeFile(const char *OutputName) {
   initFileWriter(&fileWriter, OutputFile);
   RetVal = lprofWriteData(&fileWriter, lprofGetVPDataReader(), MergeDone);
 
-  if (OutputFile == getProfileFile()) {
-    fflush(OutputFile);
-    if (doMerging() && !__llvm_profile_is_continuous_mode_enabled()) {
-      lprofUnlockFileHandle(OutputFile);
-    }
-  } else {
-    fclose(OutputFile);
-  }
-
+  closeFileObject(OutputFile);
   return RetVal;
 }
 
@@ -1359,4 +1365,130 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File,
   return 0;
 }
 
+/* Worker for __llvm_write_custom_profile. Writes the given sections to the
+ * current profile file name with "<Target>." prepended to its base name.
+ * Returns 0 if the write is skipped, -1 on error, and the result of
+ * lprofWriteDataImpl otherwise. */
+static int writeCustomProfileImpl(const char *Target,
+                                  const __llvm_profile_data *DataBegin,
+                                  const __llvm_profile_data *DataEnd,
+                                  const char *CountersBegin,
+                                  const char *CountersEnd,
+                                  const char *NamesBegin,
+                                  const char *NamesEnd) {
+  int ReturnValue, FilenameLength, TargetLength;
+  int32_t DirEnd;
+  uint32_t DirSize, BaseSize;
+  char *FilenameBuf, *TargetFilename;
+  const char *Filename;
+  FILE *OldFile, *OutputFile;
+  ProfDataWriter fileWriter;
+
+  if (lprofProfileDumped() || __llvm_profile_is_continuous_mode_enabled()) {
+    PROF_NOTE("Profile data not written to file: %s.\n", "already written");
+    return 0;
+  }
+
+  /* Check if there is llvm/runtime version mismatch. */
+  if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) {
+    PROF_ERR("Runtime and instrumentation version mismatch : "
+             "expected %d, but get %d\n",
+             INSTR_PROF_RAW_VERSION,
+             (int)GET_VERSION(__llvm_profile_get_version()));
+    return -1;
+  }
+
+  /* strlen(NULL) is undefined, so reject a missing target name up front. */
+  if (!Target) {
+    PROF_ERR("Failed to write file : %s\n", "Target not set");
+    return -1;
+  }
+
+  /* Get current filename */
+  FilenameLength = getCurFilenameLength();
+  FilenameBuf = (char *)COMPILER_RT_ALLOCA(FilenameLength + 1);
+  Filename = getCurFilename(FilenameBuf, 0);
+
+  /* Check the filename. */
+  if (!Filename) {
+    PROF_ERR("Failed to write file : %s\n", "Filename not set");
+    return -1;
+  }
+
+  /* Allocate new space for our target-specific PGO filename */
+  TargetLength = strlen(Target);
+  TargetFilename =
+      (char *)COMPILER_RT_ALLOCA(FilenameLength + TargetLength + 2);
+
+  /* Find file basename and path sizes */
+  DirEnd = FilenameLength - 1;
+  while (DirEnd >= 0 && !IS_DIR_SEPARATOR(Filename[DirEnd])) {
+    DirEnd--;
+  }
+  DirSize = DirEnd + 1;
+  BaseSize = FilenameLength - DirSize;
+
+  /* Prepend "TARGET." to current filename */
+  if (DirSize > 0) {
+    memcpy(TargetFilename, Filename, DirSize);
+  }
+  memcpy(TargetFilename + DirSize, Target, TargetLength);
+  TargetFilename[TargetLength + DirSize] = '.';
+  memcpy(TargetFilename + DirSize + 1 + TargetLength, Filename + DirSize,
+         BaseSize);
+  TargetFilename[FilenameLength + 1 + TargetLength] = 0;
+
+  /* Open and truncate target-specific PGO file. The raw profile is binary, so
+   * use binary mode, matching the "ab" used by getFileObject. */
+  OutputFile = fopen(TargetFilename, "wb");
+  if (!OutputFile) {
+    PROF_ERR("Failed to open file : %s\n", TargetFilename);
+    return -1;
+  }
+
+  /* Install the target file as the profile file only once it is open, so a
+   * failed fopen leaves the previous profile file in place. */
+  OldFile = getProfileFile();
+  setProfileFile(OutputFile);
+
+  FreeHook = &free;
+  setupIOBuffer();
+
+  /* Write custom data to the file */
+  initFileWriter(&fileWriter, OutputFile);
+  ReturnValue = lprofWriteDataImpl(
+      &fileWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, NULL, NULL,
+      lprofGetVPDataReader(), NULL, NULL, NULL, NULL, NamesBegin, NamesEnd, 0);
+
+  /* Restore the old profile file before closing: closeFileObject only closes
+   * files that are not the current profile file, so closing first would leak
+   * OutputFile. */
+  setProfileFile(OldFile);
+  closeFileObject(OutputFile);
+
+  return ReturnValue;
+}
+
+/* Write target-specific profile data to its own file (used by offload PGO),
+ * suspending SIGKILL-on-parent-exit delivery for the duration of the write. */
+int __llvm_write_custom_profile(const char *Target,
+                                const __llvm_profile_data *DataBegin,
+                                const __llvm_profile_data *DataEnd,
+                                const char *CountersBegin,
+                                const char *CountersEnd, const char *NamesBegin,
+                                const char *NamesEnd) {
+  // Temporarily suspend getting SIGKILL when the parent exits.
+  int PDeathSig = lprofSuspendSigKill();
+
+  int ReturnValue =
+      writeCustomProfileImpl(Target, DataBegin, DataEnd, CountersBegin,
+                             CountersEnd, NamesBegin, NamesEnd);
+
+  // Restore SIGKILL.
+  if (PDeathSig == 1)
+    lprofRestoreSigKill();
+
+  return ReturnValue;
+}
+
 #endif
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 147583c209fc3..db8443a7be933 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -30,6 +30,7 @@ enum class DeviceDebugKind : uint32_t {
   FunctionTracing = 1U << 1,
   CommonIssues = 1U << 2,
   AllocationTracker = 1U << 3,
+  PGODump = 1U << 4,
 };
 
 struct DeviceEnvironmentTy {
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index 7c7e6de613c9f..e030ab9e6b61f 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -63,14 +63,24 @@ struct __llvm_profile_data {
 #include "llvm/ProfileData/InstrProfData.inc"
 };
 
+/// Entry point provided by the compiler-rt profiling runtime. Declared weak so
+/// callers can test the symbol for null when that runtime is not linked.
+extern "C" {
+extern int __attribute__((weak)) __llvm_write_custom_profile(
+    const char *Target, const __llvm_profile_data *DataBegin,
+    const __llvm_profile_data *DataEnd, const char *CountersBegin,
+    const char *CountersEnd, const char *NamesBegin, const char *NamesEnd);
+}
+
 /// PGO profiling data extracted from a GPU device
 struct GPUProfGlobals {
-  SmallVector<uint8_t> NamesData;
-  SmallVector<SmallVector<int64_t>> Counts;
+  SmallVector<int64_t> Counts;
   SmallVector<__llvm_profile_data> Data;
+  SmallVector<uint8_t> NamesData;
   Triple TargetTriple;
 
   void dump() const;
+  Error write() const;
 };
 
 /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 8854fc52205a7..8783490831e25 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -206,7 +206,7 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
       GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
       if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
         return Err;
-      DeviceProfileData.Counts.push_back(std::move(Counts));
+      DeviceProfileData.Counts.append(std::move(Counts));
     } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
       // Read profiling data for this global variable
       __llvm_profile_data Data{};
@@ -224,15 +224,14 @@ void GPUProfGlobals::dump() const {
          << "\n";
 
   outs() << "======== Counters =========\n";
-  for (const auto &Count : Counts) {
-    outs() << "[";
-    for (size_t i = 0; i < Count.size(); i++) {
-      if (i == 0)
-        outs() << " ";
-      outs() << Count[i] << " ";
-    }
-    outs() << "]\n";
+  for (size_t i = 0; i < Counts.size(); i++) {
+    if (i > 0 && i % 10 == 0)
+      outs() << "\n";
+    else if (i != 0)
+      outs() << " ";
+    outs() << Counts[i];
   }
+  outs() << "\n";
 
   outs() << "========== Data ===========\n";
   for (const auto &ProfData : Data) {
@@ -264,3 +263,43 @@ void GPUProfGlobals::dump() const {
   Symtab.dumpNames(outs());
   outs() << "===========================\n";
 }
+
+/// Write the collected device profile through the compiler-rt profiling
+/// runtime. Counters, data records and names are packed, in that order, into
+/// one contiguous buffer before being passed to __llvm_write_custom_profile.
+Error GPUProfGlobals::write() const {
+  if (!__llvm_write_custom_profile)
+    return Plugin::error("Could not find symbol __llvm_write_custom_profile. "
+                         "The compiler-rt profiling library must be linked for "
+                         "GPU PGO to work.");
+
+  const size_t CountsSize = Counts.size() * sizeof(int64_t);
+  const size_t DataSize = Data.size() * sizeof(__llvm_profile_data);
+  const size_t NamesSize = NamesData.size();
+
+  // Initialize array of contiguous data. We need to make sure each section is
+  // contiguous so that the PGO library can compute deltas properly
+  SmallVector<uint8_t> ContiguousData(CountsSize + DataSize + NamesSize);
+
+  // Compute region pointers. Layout: [counters | data | names].
+  char *CountersBegin = reinterpret_cast<char *>(ContiguousData.data());
+  char *CountersEnd = CountersBegin + CountsSize;
+  char *NamesBegin = CountersEnd + DataSize;
+  char *NamesEnd = NamesBegin + NamesSize;
+  auto *DataBegin = reinterpret_cast<__llvm_profile_data *>(CountersEnd);
+  auto *DataEnd = reinterpret_cast<__llvm_profile_data *>(NamesBegin);
+
+  // Copy data to contiguous buffer
+  memcpy(DataBegin, Data.data(), DataSize);
+  memcpy(CountersBegin, Counts.data(), CountsSize);
+  memcpy(NamesBegin, NamesData.data(), NamesSize);
+
+  // Invoke compiler-rt entrypoint
+  int Result = __llvm_write_custom_profile(TargetTriple.str().c_str(),
+                                           DataBegin, DataEnd, CountersBegin,
+                                           CountersEnd, NamesBegin, NamesEnd);
+  if (Result != 0)
+    return Plugin::error("Error writing GPU PGO data to file");
+
+  return Plugin::success();
+}
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 48c9b671c1a91..bb3fc77258f3e 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -861,8 +861,14 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
     if (!ProfOrErr)
       return ProfOrErr.takeError();
 
-    // TODO: write data to profiling file
-    ProfOrErr->dump();
+    // Dump out profdata
+    if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
+        uint32_t(DeviceDebugKind::PGODump))
+      ProfOrErr->dump();
+
+    // Write data to profiling file
+    if (auto Err = ProfOrErr->write())
+      return Err;
   }
 
   // Delete the memory manager before deinitializing the device.
Otherwise, diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 658ae5f9653ba..1e265d2c30904 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -112,8 +112,10 @@ config.available_features.add(config.libomptarget_current_target) if config.libomptarget_has_libc: config.available_features.add('libc') +profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata") if config.libomptarget_test_pgo: config.available_features.add('pgo') + config.substitutions.append(("%profdata", profdata_path)) # Determine whether the test system supports unified memory. # For CUDA, this is the case with compute capability 70 (Volta) or higher. @@ -407,6 +409,8 @@ if config.test_fortran_compiler: config.available_features.add('flang') config.substitutions.append(("%flang", config.test_fortran_compiler)) +config.substitutions.append(("%target_triple", config.libomptarget_current_target)) + config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path: config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path)) diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index ce3f6abf50a13..00f4e2b74a5b0 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -1,6 +1,6 @@ @AUTO_GEN_COMMENT@ -config.bin_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" +config.bin_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" config.test_fortran_compiler="@OPENMP_TEST_Fortran_COMPILER@" diff --git a/offload/test/offloading/pgo1.c b/offload/test/offloading/pgo1.c index 5dc1e5d95caf3..9f23703028bec 100644 --- a/offload/test/offloading/pgo1.c +++ b/offload/test/offloading/pgo1.c @@ -1,12 +1,17 @@ -// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ -// RUN: -Xclang "-fprofile-instrument=clang" -// RUN: 
%libomptarget-run-generic 2>&1 | %fcheck-generic \ -// RUN: --check-prefix="CLANG-PGO" // RUN: %libomptarget-compile-generic -fprofile-generate \ // RUN: -Xclang "-fprofile-instrument=llvm" -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic \ +// RUN: env LLVM_PROFILE_FILE=llvm.profraw %libomptarget-run-generic 2>&1 +// RUN: %profdata show --all-functions --counts \ +// RUN: %target_triple.llvm.profraw | %fcheck-generic \ // RUN: --check-prefix="LLVM-PGO" +// RUN: %libomptarget-compile-generic -fprofile-instr-generate \ +// RUN: -Xclang "-fprofile-instrument=clang" +// RUN: env LLVM_PROFILE_FILE=clang.profraw %libomptarget-run-generic 2>&1 +// RUN: %profdata show --all-functions --counts \ +// RUN: %target_triple.clang.profraw | %fcheck-generic \ +// RUN: --check-prefix="CLANG-PGO" + // REQUIRES: gpu // REQUIRES: pgo diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index 951c651f42f29..cd78a5ba88e2c 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1522,3 +1522,4 @@ debugging features are supported. * Enable debugging assertions in the device. ``0x01`` * Enable diagnosing common problems during offloading . ``0x4`` * Enable device malloc statistics (amdgpu only). ``0x8`` + * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``