diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 32ddf80999..0a08c0e2ed 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -145,7 +145,9 @@ set( MIOpen_Source rnn_api.cpp rnn/rnn_util.cpp rnn/Solutions/rnn_transformer.cpp + softmax.cpp softmax_api.cpp + softmax/problem_description.cpp solution.cpp solver.cpp solver/activ/bwd_0.cpp @@ -253,6 +255,7 @@ set( MIOpen_Source solver/pooling/backwardNd.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_sum.cpp + solver/softmax/softmax.cpp subbuffers.cpp sum_api.cpp target_properties.cpp @@ -565,7 +568,6 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ocl/mloNorm.cpp ocl/pooling_ocl.cpp ocl/tensorocl.cpp - ocl/softmaxocl.cpp ocl/rnnocl.cpp ocl/utilocl.cpp ocl/ctcocl.cpp diff --git a/src/include/miopen/ctc.hpp b/src/include/miopen/ctc.hpp index 0757814e47..546c22de6f 100644 --- a/src/include/miopen/ctc.hpp +++ b/src/include/miopen/ctc.hpp @@ -56,7 +56,7 @@ struct CTCLossDescriptor : miopenCTCLossDescriptor const int* inputLengths, miopenCTCLossAlgo_t algo) const; - void CTCLoss(const Handle& handle, + void CTCLoss(Handle& handle, const TensorDescriptor& probsDesc, ConstData_t probs, const int* labels, diff --git a/src/include/miopen/softmax.hpp b/src/include/miopen/softmax.hpp index 62dcb7d579..b4f8909908 100644 --- a/src/include/miopen/softmax.hpp +++ b/src/include/miopen/softmax.hpp @@ -34,7 +34,7 @@ namespace miopen { struct Handle; struct TensorDescriptor; -miopenStatus_t SoftmaxForward(const Handle& handle, +miopenStatus_t SoftmaxForward(Handle& handle, const void* alpha, const void* beta, const TensorDescriptor& xDesc, @@ -46,7 +46,7 @@ miopenStatus_t SoftmaxForward(const Handle& handle, int x_offset = 0, int y_offset = 0); -miopenStatus_t SoftmaxBackward(const Handle& handle, +miopenStatus_t SoftmaxBackward(Handle& handle, const void* alpha, const TensorDescriptor& yDesc, ConstData_t y, diff --git a/src/include/miopen/softmax/invoke_params.hpp b/src/include/miopen/softmax/invoke_params.hpp new file mode 100644 index 0000000000..c2792929ab --- /dev/null +++ b/src/include/miopen/softmax/invoke_params.hpp @@ -0,0 +1,147 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { +namespace softmax { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams(const void* alpha_, + const void* beta_, + const TensorDescriptor& xDesc_, + ConstData_t x_, + const TensorDescriptor& yDesc_, + Data_t y_, + miopenSoftmaxAlgorithm_t algorithm_, + miopenSoftmaxMode_t mode_, + int x_offset_ = 0, + int y_offset_ = 0) + : algorithm(algorithm_), + mode(mode_), + + xdxDesc(xDesc_), + x(x_), + dx(nullptr), + + yDesc(yDesc_), + forward_y(y_), + backward_y(nullptr), + + dy(nullptr), + + xdx_offset(x_offset_), + y_offset(y_offset_), + dy_offset(0) + { + InitializeAlphaBeta(alpha_, beta_); + } + + InvokeParams(const void* alpha_, + const void* beta_, + const TensorDescriptor& yDesc_, + ConstData_t y_, + const TensorDescriptor& dyDesc_, + ConstData_t dy_, + const TensorDescriptor& dxDesc_, + Data_t dx_, + miopenSoftmaxAlgorithm_t algorithm_, + miopenSoftmaxMode_t mode_, + int y_offset_, + int dy_offset_, + int dx_offset_) + : algorithm(algorithm_), + mode(mode_), + + xdxDesc(dxDesc_), + x(nullptr), + dx(dx_), + + yDesc(yDesc_), + forward_y(nullptr), + backward_y(y_), + + dyDesc(dyDesc_), + dy(dy_), + + xdx_offset(dx_offset_), + y_offset(y_offset_), + dy_offset(dy_offset_) + { + InitializeAlphaBeta(alpha_, beta_); + } + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } + +public: + float alpha; + float beta; + miopenSoftmaxAlgorithm_t algorithm; + miopenSoftmaxMode_t mode; + + // xdxDesc is used for both forward and backward + TensorDescriptor xdxDesc; + ConstData_t x; + Data_t dx; + + TensorDescriptor yDesc; + Data_t forward_y; + ConstData_t backward_y; + + // backward specific part + TensorDescriptor dyDesc; + ConstData_t dy; + + // xdx_offset is used for both forward and backward + int xdx_offset; + int y_offset; + int dy_offset; + +private: + void InitializeAlphaBeta(const void* alpha_, const void* beta_) + { + alpha = 0.0f; + beta = 0.0f; + + if(alpha_ != nullptr) + { + alpha = *(static_cast(alpha_)); + } + + if(beta_ != nullptr) + { + beta = *(static_cast(beta_)); + } + } +}; + +} // namespace softmax +} // namespace miopen diff --git a/src/include/miopen/softmax/problem_description.hpp b/src/include/miopen/softmax/problem_description.hpp new file mode 100644 index 0000000000..9a5e7150b4 --- /dev/null +++ b/src/include/miopen/softmax/problem_description.hpp @@ -0,0 +1,147 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace softmax { + +struct ProblemDescription : ProblemDescriptionBase +{ + // softmax forward constructor + ProblemDescription(const void* alpha_, + const void* beta_, + const TensorDescriptor& xDesc_, + const TensorDescriptor& yDesc_, + miopenSoftmaxAlgorithm_t algorithm_, + miopenSoftmaxMode_t mode_) + : isForward(true), + xdxDesc(xDesc_), + yDesc(yDesc_), + + algorithm(algorithm_), + mode(mode_) + { + CheckAndAssignAlphaBeta(alpha_, beta_); + + if(xdxDesc.GetType() != yDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); + } + + if(xdxDesc.GetLengths() != yDesc.GetLengths()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); + } + } + + ProblemDescription(const void* alpha_, + const void* beta_, + const TensorDescriptor& yDesc_, + const TensorDescriptor& dyDesc_, + const TensorDescriptor& dxDesc_, + miopenSoftmaxAlgorithm_t algorithm_, + miopenSoftmaxMode_t mode_) + : isForward(false), + xdxDesc(dxDesc_), + yDesc(yDesc_), + dyDesc(dyDesc_), + algorithm(algorithm_), + mode(mode_) + { + CheckAndAssignAlphaBeta(alpha_, beta_); + + if(yDesc != dyDesc) + { + MIOPEN_THROW(miopenStatusBadParm); + } + + if(xdxDesc.GetType() != dyDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); + } + + if(xdxDesc.GetLengths() != dyDesc.GetLengths()) + { + MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); + } + } + + bool IsForward() const { return isForward; } + miopenSoftmaxAlgorithm_t GetAlgorithm() const { return algorithm; } + miopenSoftmaxMode_t GetMode() const { return mode; } + float GetAlpha() const { return alpha; } + float GetBeta() const { return beta; } + + // for forward + const TensorDescriptor& GetXDesc() const { return xdxDesc; } + const TensorDescriptor& GetYDesc() const { return yDesc; } + + // for backward + const TensorDescriptor& GetdYDesc() const { return dyDesc; } + const TensorDescriptor& GetdXDesc() const { return xdxDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +private: + void CheckAndAssignAlphaBeta(const void* alpha_, const void* beta_) + { + if(alpha_ == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Alpha value is nullptr"); + } + + if(beta_ == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Beta value is nullptr"); + } + + alpha = *(static_cast(alpha_)); + beta = *(static_cast(beta_)); + } + + const bool isForward; + + float alpha; + float beta; + + // for forward xDesc is stored in xdxDesc, for backward dxDesc is stored in xdxDesc + TensorDescriptor xdxDesc; + TensorDescriptor yDesc; + TensorDescriptor dyDesc; + + const miopenSoftmaxAlgorithm_t algorithm; + const miopenSoftmaxMode_t mode; +}; + +} // namespace softmax +} // namespace miopen diff --git a/src/include/miopen/softmax/solvers.hpp b/src/include/miopen/softmax/solvers.hpp new file mode 100644 index 0000000000..acbcca4d72 --- /dev/null +++ b/src/include/miopen/softmax/solvers.hpp @@ -0,0 +1,62 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +#include + +namespace miopen { + +namespace solver { + +namespace softmax { + +using SoftmaxSolver = NonTunableSolverBase; + +struct Softmax final : SoftmaxSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::softmax::ProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::softmax::ProblemDescription& problem) const override; + + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::softmax::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } +}; + +} // namespace softmax + +} // namespace solver + +} // namespace miopen diff --git a/src/ocl/ctcocl.cpp b/src/ocl/ctcocl.cpp index d594544403..f38c4c65d7 100644 --- a/src/ocl/ctcocl.cpp +++ b/src/ocl/ctcocl.cpp @@ -44,7 +44,7 @@ namespace miopen { -void CTCLossDescriptor::CTCLoss(const Handle& handle, +void CTCLossDescriptor::CTCLoss(Handle& handle, const TensorDescriptor& probsDesc, ConstData_t probs, const int* labels, diff --git a/src/ocl/softmaxocl.cpp b/src/ocl/softmaxocl.cpp deleted file mode 100644 index 38e53413e7..0000000000 --- a/src/ocl/softmaxocl.cpp +++ /dev/null @@ -1,613 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include -#include -#include -#include -#include - -namespace miopen { - -int nextPow2(int v) -{ - - if(v == 1) - { - return (v << 1); - } - else - { - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - return v; - } -} - -miopenStatus_t SoftmaxForward(const Handle& handle, - const void* alpha, - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - miopenSoftmaxAlgorithm_t algorithm, - miopenSoftmaxMode_t mode, - int x_offset, - int y_offset) -{ - if(x == nullptr || y == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); - } - - if(xDesc.GetType() != yDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); - } - - if(xDesc.GetLengths() != yDesc.GetLengths()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(yDesc.GetLengths()); - - int in_nstr, in_cstr, in_hstr; - std::tie(in_nstr, in_cstr, in_hstr, std::ignore) = tien<4>(xDesc.GetStrides()); - - int out_nstr, out_cstr, out_hstr; - std::tie(out_nstr, out_cstr, out_hstr, std::ignore) = tien<4>(yDesc.GetStrides()); - - // using workgroup size of 256 by default - int grid_size = mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w; - int spatial_dim = mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 1 : h * w; - int vector_size = mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? c * h * w : c; - // num_spatial_dims or pixels each workgroup can compute - int num_batch = vector_size < 256 ? nextPow2(256 / vector_size) : 1; - - const std::vector vld{256, 1, 1}; - - bool usefp16 = false; - bool usefp32 = true; - if(yDesc.GetType() == miopenHalf) - { - usefp16 = true; - usefp32 = false; - } - - auto alpha_fp = *(static_cast(alpha)); - auto beta_fp = *(static_cast(beta)); - - // See Kernels/MIOpenSoftmax.cl for description - if(num_batch == 1) - { // CSR-Vector like approach - - // Control the max. number of workgroups launched so that we do not - // start getting workgroup scheduling overheads - size_t workgroups = std::min(grid_size, 64 * 40 * 8); - const std::vector vgd{workgroups * vld[0], 1, 1}; - - std::string algo_name = "SoftmaxForwardOneBatch"; - std::string network_config = - "sfmfwd-n" + std::to_string(num_batch) + "half" + - std::to_string(static_cast(usefp16)) + "float" + - std::to_string(static_cast(usefp32)) + "g" + std::to_string(vgd[0]) + "l" + - std::to_string(vld[0]) + "dim" + std::to_string(spatial_dim) + "grid" + - std::to_string(grid_size) + "wg" + std::to_string(workgroups) + "v" + - std::to_string(vector_size) + "xpk" + - std::to_string(static_cast(xDesc.IsPacked())) + "ypk" + - std::to_string(static_cast(yDesc.IsPacked())) + "a" + std::to_string(alpha_fp) + - "b" + std::to_string(beta_fp) + "algo" + std::to_string(static_cast(algorithm)) + - "mode" + std::to_string(static_cast(mode)); - - auto&& kernels = handle.GetKernels(algo_name, network_config); - - if(!kernels.empty()) - { - kernels.front()(x, - y, - vector_size, - grid_size, - spatial_dim, - h, - w, - in_nstr, - in_cstr, - in_hstr, - out_nstr, - out_cstr, - out_hstr, - x_offset, - y_offset, - alpha_fp, - beta_fp); - } - else - { - std::string program_name = "MIOpenSoftmax.cl"; - std::string kernel_name = "SoftmaxForward"; - - // compile parameters - std::string parms = "-DNUM_BATCH=" + std::to_string(num_batch) + - " -DMIOPEN_USE_FP16=" + std::to_string(static_cast(usefp16)) + - " -DMIOPEN_USE_FP32=" + std::to_string(static_cast(usefp32)); - - if(algorithm == MIOPEN_SOFTMAX_LOG) - parms += " -DUSE_SOFTMAX_LOG=1"; - else if(algorithm == MIOPEN_SOFTMAX_FAST) - parms += " -DUSE_SOFTMAX_FAST=1"; - else - parms += " -DUSE_SOFTMAX_ACCURATE=1"; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - parms += " -DUSE_SOFTMAX_MODE_INSTANCE=1"; - else - parms += " -DUSE_SOFTMAX_MODE_CHANNEL=1"; - - parms += " -DRUN_FORWARD=1"; - parms += " -DIS_INPUT_PACKED=" + std::to_string(static_cast(xDesc.IsPacked())) + - " -DIS_OUTPUT_PACKED=" + std::to_string(static_cast(yDesc.IsPacked())); - - if(!float_equal(alpha_fp, 1.0)) - parms += " -DUSE_ALPHA=1"; - - if(!float_equal(beta_fp, 0)) - parms += " -DUSE_BETA=1"; - - handle.AddKernel(algo_name, network_config, program_name, kernel_name, vld, vgd, parms)( - x, - y, - vector_size, - grid_size, - spatial_dim, - h, - w, - in_nstr, - in_cstr, - in_hstr, - out_nstr, - out_cstr, - out_hstr, - x_offset, - y_offset, - alpha_fp, - beta_fp); - } - } - else - { // CSR-Stream like approach - - // num_threads iterating over channels for one spatial_dim - int batch_size = 256 / num_batch; - // num_channels each threads iterates over to cover all the channels - int u_batch_size = (vector_size > batch_size) ? nextPow2(vector_size / batch_size) : 1; - - size_t workgroups = - (grid_size % num_batch == 0) ? (grid_size / num_batch) : (grid_size / num_batch + 1); - const std::vector vgd{workgroups * vld[0], 1, 1}; - - if((u_batch_size + 1) * 256 > 65536 && yDesc.GetType() == miopenHalf) - MIOPEN_THROW(miopenStatusBadParm, "Exceed local memory capacity"); - - std::string algo_name = "SoftmaxForwardMultiBatch"; - std::string network_config = - "sfmfwd-n" + std::to_string(num_batch) + "half" + - std::to_string(static_cast(usefp16)) + "float" + - std::to_string(static_cast(usefp32)) + "g" + std::to_string(vgd[0]) + "l" + - std::to_string(vld[0]) + "dim" + std::to_string(spatial_dim) + "grid" + - std::to_string(grid_size) + "wg" + std::to_string(workgroups) + "v" + - std::to_string(vector_size) + "ubatch" + std::to_string(u_batch_size) + "batch" + - std::to_string(batch_size) + "xpk" + - std::to_string(static_cast(xDesc.IsPacked())) + "ypk" + - std::to_string(static_cast(yDesc.IsPacked())) + "a" + std::to_string(alpha_fp) + - "b" + std::to_string(beta_fp) + "algo" + std::to_string(static_cast(algorithm)) + - "mode" + std::to_string(static_cast(mode)); - - auto&& kernels = handle.GetKernels(algo_name, network_config); - - if(!kernels.empty()) - { - kernels.front()(x, - y, - vector_size, - grid_size, - spatial_dim, - h, - w, - in_nstr, - in_cstr, - in_hstr, - out_nstr, - out_cstr, - out_hstr, - x_offset, - y_offset, - alpha_fp, - beta_fp); - } - else - { - std::string program_name = "MIOpenSoftmax.cl"; - std::string kernel_name = "SoftmaxForward"; - std::string parms = "-DNUM_BATCH=" + std::to_string(num_batch) + - " -DBATCH_SIZE=" + std::to_string(batch_size) + - " -DU_BATCH_SIZE=" + std::to_string(u_batch_size) + - " -DMIOPEN_USE_FP16=" + std::to_string(static_cast(usefp16)) + - " -DMIOPEN_USE_FP32=" + std::to_string(static_cast(usefp32)); - - if(algorithm == MIOPEN_SOFTMAX_LOG) - parms += " -DUSE_SOFTMAX_LOG=1"; - else if(algorithm == MIOPEN_SOFTMAX_FAST) - parms += " -DUSE_SOFTMAX_FAST=1"; - else - parms += " -DUSE_SOFTMAX_ACCURATE=1"; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - parms += " -DUSE_SOFTMAX_MODE_INSTANCE=1"; - else - parms += " -DUSE_SOFTMAX_MODE_CHANNEL=1"; - - parms += " -DRUN_FORWARD=1"; - parms += " -DIS_INPUT_PACKED=" + std::to_string(static_cast(xDesc.IsPacked())) + - " -DIS_OUTPUT_PACKED=" + std::to_string(static_cast(yDesc.IsPacked())); - - if(!float_equal(alpha_fp, 1.0)) - parms += " -DUSE_ALPHA=1"; - - if(!float_equal(beta_fp, 0)) - parms += " -DUSE_BETA=1"; - - handle.AddKernel(algo_name, network_config, program_name, kernel_name, vld, vgd, parms)( - x, - y, - vector_size, - grid_size, - spatial_dim, - h, - w, - in_nstr, - in_cstr, - in_hstr, - out_nstr, - out_cstr, - out_hstr, - x_offset, - y_offset, - alpha_fp, - beta_fp); - } - } - if(miopen::CheckNumericsEnabled()) - { - miopen::checkNumericsOutput(handle, yDesc, y); - } - return miopenStatusSuccess; -} - -miopenStatus_t SoftmaxBackward(const Handle& handle, - const void* alpha, - const TensorDescriptor& yDesc, - ConstData_t y, - const TensorDescriptor& dyDesc, - ConstData_t dy, - const void* beta, - const TensorDescriptor& dxDesc, - Data_t dx, - miopenSoftmaxAlgorithm_t algorithm, - miopenSoftmaxMode_t mode, - int y_offset, - int dy_offset, - int dx_offset) -{ - if(dx == nullptr || y == nullptr || dy == nullptr) - { - MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); - } - - if(yDesc != dyDesc) - { - MIOPEN_THROW(miopenStatusBadParm); - } - - if(dxDesc.GetType() != dyDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match."); - } - - if(dxDesc.GetLengths() != dyDesc.GetLengths()) - { - MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match."); - } - - if(miopen::CheckNumericsEnabled()) - { - miopen::checkNumericsInput(handle, yDesc, y); - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(dxDesc.GetLengths()); - - int din_nstr, din_cstr, din_hstr; - std::tie(din_nstr, din_cstr, din_hstr, std::ignore) = tien<4>(dxDesc.GetStrides()); - - int dout_nstr, dout_cstr, dout_hstr; - std::tie(dout_nstr, dout_cstr, dout_hstr, std::ignore) = tien<4>(dyDesc.GetStrides()); - - int out_nstr, out_cstr, out_hstr; - std::tie(out_nstr, out_cstr, out_hstr, std::ignore) = tien<4>(yDesc.GetStrides()); - - // using workgroup size of 256 by default - int grid_size = mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? n : n * h * w; - int spatial_dim = mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 1 : h * w; - int vector_size = mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? c * h * w : c; - // num_spatial_dims or pixels each workgroup can compute - int num_batch = vector_size < 256 ? nextPow2(256 / vector_size) : 1; - - const std::vector vld{256, 1, 1}; - - bool usefp16 = false; - bool usefp32 = true; - if(yDesc.GetType() == miopenHalf) - { - usefp16 = true; - usefp32 = false; - } - - auto alpha_fp = *(static_cast(alpha)); - auto beta_fp = *(static_cast(beta)); - - // See Kernels/MIOpenSoftmax.cl for description - if(num_batch == 1) - { // CSR-Vector like approach - - // Control the max. number of workgroups launched so that we do not - // start getting workgroup scheduling overheads - size_t workgroups = std::min(grid_size, 64 * 40 * 8); - const std::vector vgd{workgroups * vld[0], 1, 1}; - - std::string algo_name = "SoftmaxBackwardOneBatch"; - std::string network_config = - "sfmbwd-n" + std::to_string(num_batch) + "half" + - std::to_string(static_cast(usefp16)) + "float" + - std::to_string(static_cast(usefp32)) + "g" + std::to_string(vgd[0]) + "l" + - std::to_string(vld[0]) + "dim" + std::to_string(spatial_dim) + "grid" + - std::to_string(grid_size) + "wg" + std::to_string(workgroups) + "v" + - std::to_string(vector_size) + "ypk" + - std::to_string(static_cast(yDesc.IsPacked())) + "dypk" + - std::to_string(static_cast(dyDesc.IsPacked())) + "dxpk" + - std::to_string(static_cast(dxDesc.IsPacked())) + "a" + std::to_string(alpha_fp) + - "b" + std::to_string(beta_fp) + "algo" + std::to_string(static_cast(algorithm)) + - "mode" + std::to_string(static_cast(mode)); - - auto&& kernels = handle.GetKernels(algo_name, network_config); - - if(!kernels.empty()) - { - kernels.front()(y, - dy, - dx, - vector_size, - grid_size, - spatial_dim, - h, - w, - out_nstr, - out_cstr, - out_hstr, - dout_nstr, - dout_cstr, - dout_hstr, - din_nstr, - din_cstr, - din_hstr, - y_offset, - dy_offset, - dx_offset, - alpha_fp, - beta_fp); - } - else - { - std::string program_name = "MIOpenSoftmax.cl"; - std::string kernel_name = "SoftmaxBackward"; - std::string parms = "-DNUM_BATCH=" + std::to_string(num_batch) + - " -DMIOPEN_USE_FP16=" + std::to_string(static_cast(usefp16)) + - " -DMIOPEN_USE_FP32=" + std::to_string(static_cast(usefp32)); - - if(algorithm == MIOPEN_SOFTMAX_LOG) - parms += " -DUSE_SOFTMAX_LOG=1"; - else if(algorithm == MIOPEN_SOFTMAX_FAST) - parms += " -DUSE_SOFTMAX_FAST=1"; - else - parms += " -DUSE_SOFTMAX_ACCURATE=1"; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - parms += " -DUSE_SOFTMAX_MODE_INSTANCE=1"; - else - parms += " -DUSE_SOFTMAX_MODE_CHANNEL=1"; - - parms += " -DRUN_FORWARD=0"; - parms += " -DIS_OUTPUT_PACKED=" + std::to_string(static_cast(yDesc.IsPacked())) + - " -DIS_DOUTPUT_PACKED=" + std::to_string(static_cast(dyDesc.IsPacked())) + - " -DIS_DINPUT_PACKED=" + std::to_string(static_cast(dxDesc.IsPacked())); - - if(!float_equal(alpha_fp, 1.0)) - parms += " -DUSE_ALPHA=1"; - - if(!float_equal(beta_fp, 0)) - parms += " -DUSE_BETA=1"; - - handle.AddKernel(algo_name, network_config, program_name, kernel_name, vld, vgd, parms)( - y, - dy, - dx, - vector_size, - grid_size, - spatial_dim, - h, - w, - out_nstr, - out_cstr, - out_hstr, - dout_nstr, - dout_cstr, - dout_hstr, - din_nstr, - din_cstr, - din_hstr, - y_offset, - dy_offset, - dx_offset, - alpha_fp, - beta_fp); - } - } - else - { // CSR-Stream like approach - int batch_size = 256 / num_batch; - int u_batch_size = (vector_size > batch_size) ? nextPow2(vector_size / batch_size) : 1; - size_t workgroups = - (grid_size % num_batch == 0) ? (grid_size / num_batch) : (grid_size / num_batch + 1); - const std::vector vgd{workgroups * vld[0], 1, 1}; - - if((2 * u_batch_size + 1) * 256 > 65536 && yDesc.GetType() == miopenHalf) - MIOPEN_THROW(miopenStatusBadParm, "Exceed local memory capacity"); - - std::string algo_name = "SoftmaxBackwardMultiBatch"; - std::string network_config = - "sfmbwd-n" + std::to_string(num_batch) + "half" + - std::to_string(static_cast(usefp16)) + "float" + - std::to_string(static_cast(usefp32)) + "g" + std::to_string(vgd[0]) + "l" + - std::to_string(vld[0]) + "dim" + std::to_string(spatial_dim) + "grid" + - std::to_string(grid_size) + "wg" + std::to_string(workgroups) + "v" + - std::to_string(vector_size) + "ubatch" + std::to_string(u_batch_size) + "batch" + - std::to_string(batch_size) + "ypk" + - std::to_string(static_cast(yDesc.IsPacked())) + "dypk" + - std::to_string(static_cast(dyDesc.IsPacked())) + "dxpk" + - std::to_string(static_cast(dxDesc.IsPacked())) + "a" + std::to_string(alpha_fp) + - "b" + std::to_string(beta_fp) + "algo" + std::to_string(static_cast(algorithm)) + - "mode" + std::to_string(static_cast(mode)); - - auto&& kernels = handle.GetKernels(algo_name, network_config); - - if(!kernels.empty()) - { - kernels.front()(y, - dy, - dx, - vector_size, - grid_size, - spatial_dim, - h, - w, - out_nstr, - out_cstr, - out_hstr, - dout_nstr, - dout_cstr, - dout_hstr, - din_nstr, - din_cstr, - din_hstr, - y_offset, - dy_offset, - dx_offset, - alpha_fp, - beta_fp); - } - else - { - std::string program_name = "MIOpenSoftmax.cl"; - std::string kernel_name = "SoftmaxBackward"; - std::string parms = "-DNUM_BATCH=" + std::to_string(num_batch) + - " -DBATCH_SIZE=" + std::to_string(batch_size) + - " -DU_BATCH_SIZE=" + std::to_string(u_batch_size) + - " -DMIOPEN_USE_FP16=" + std::to_string(static_cast(usefp16)) + - " -DMIOPEN_USE_FP32=" + std::to_string(static_cast(usefp32)); - - if(algorithm == MIOPEN_SOFTMAX_LOG) - parms += " -DUSE_SOFTMAX_LOG=1"; - else if(algorithm == MIOPEN_SOFTMAX_FAST) - parms += " -DUSE_SOFTMAX_FAST=1"; - else - parms += " -DUSE_SOFTMAX_ACCURATE=1"; - - if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) - parms += " -DUSE_SOFTMAX_MODE_INSTANCE=1"; - else - parms += " -DUSE_SOFTMAX_MODE_CHANNEL=1"; - - parms += " -DRUN_FORWARD=0"; - parms += " -DIS_OUTPUT_PACKED=" + std::to_string(static_cast(yDesc.IsPacked())) + - " -DIS_DOUTPUT_PACKED=" + std::to_string(static_cast(dyDesc.IsPacked())) + - " -DIS_DINPUT_PACKED=" + std::to_string(static_cast(dxDesc.IsPacked())); - - if(!float_equal(alpha_fp, 1.0)) - parms += " -DUSE_ALPHA=1"; - - if(!float_equal(beta_fp, 0)) - parms += " -DUSE_BETA=1"; - - handle.AddKernel(algo_name, network_config, program_name, kernel_name, vld, vgd, parms)( - y, - dy, - dx, - vector_size, - grid_size, - spatial_dim, - h, - w, - out_nstr, - out_cstr, - out_hstr, - dout_nstr, - dout_cstr, - dout_hstr, - din_nstr, - din_cstr, - din_hstr, - y_offset, - dy_offset, - dx_offset, - alpha_fp, - beta_fp); - } - } - if(miopen::CheckNumericsEnabled()) - { - miopen::checkNumericsOutput(handle, dxDesc, dx); - } - - return miopenStatusSuccess; -} - -} // namespace miopen diff --git a/src/softmax.cpp b/src/softmax.cpp new file mode 100644 index 0000000000..6ac501ad71 --- /dev/null +++ b/src/softmax.cpp @@ -0,0 +1,107 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace miopen { + +miopenStatus_t SoftmaxForward(Handle& handle, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + miopenSoftmaxAlgorithm_t algorithm, + miopenSoftmaxMode_t mode, + int x_offset, + int y_offset) +{ + if(x == nullptr || y == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); + } + + const auto problem = softmax::ProblemDescription{alpha, beta, xDesc, yDesc, algorithm, mode}; + const auto invoke_params = + softmax::InvokeParams{alpha, beta, xDesc, x, yDesc, y, algorithm, mode, x_offset, y_offset}; + const auto algo = AlgorithmName{"Softmax"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t SoftmaxBackward(Handle& handle, + const void* alpha, + const TensorDescriptor& yDesc, + ConstData_t y, + const TensorDescriptor& dyDesc, + ConstData_t dy, + const void* beta, + const TensorDescriptor& dxDesc, + Data_t dx, + miopenSoftmaxAlgorithm_t algorithm, + miopenSoftmaxMode_t mode, + int y_offset, + int dy_offset, + int dx_offset) +{ + if(dx == nullptr || y == nullptr || dy == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor."); + } + + const auto problem = + softmax::ProblemDescription{alpha, beta, yDesc, dyDesc, dxDesc, algorithm, mode}; + const auto invoke_params = softmax::InvokeParams{alpha, + beta, + yDesc, + y, + dyDesc, + dy, + dxDesc, + dx, + algorithm, + mode, + y_offset, + dy_offset, + dx_offset}; + const auto algo = AlgorithmName{"Softmax"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/softmax/problem_description.cpp b/src/softmax/problem_description.cpp new file mode 100644 index 0000000000..e40a725617 --- /dev/null +++ b/src/softmax/problem_description.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace softmax { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const +{ + std::string network_config = "sfmfwd-"; + + if(isForward) + { + int n_x, c_x, h_x, w_x; + int n_y, c_y, h_y, w_y; + + std::tie(n_x, c_x, h_x, w_x) = tien<4>(xdxDesc.GetLengths()); + std::tie(n_y, c_y, h_y, w_y) = tien<4>(yDesc.GetLengths()); + + network_config += "n_x" + std::to_string(n_x) + "c_x" + std::to_string(c_x) + "h_x" + + std::to_string(h_x) + "w_x" + std::to_string(w_x) + + + "n_y" + std::to_string(n_y) + "c_y" + std::to_string(c_y) + "h_y" + + std::to_string(h_y) + "w_y" + std::to_string(w_y); + + network_config += "xpk" + std::to_string(static_cast(xdxDesc.IsPacked())) + "ypk" + + std::to_string(static_cast(yDesc.IsPacked())); + } + else + { + int n_y, c_y, h_y, w_y; + int n_dy, c_dy, h_dy, w_dy; + int n_dx, c_dx, h_dx, w_dx; + + std::tie(n_y, c_y, h_y, w_y) = tien<4>(yDesc.GetLengths()); + std::tie(n_dy, c_dy, h_dy, w_dy) = tien<4>(dyDesc.GetLengths()); + std::tie(n_dx, c_dx, h_dx, w_dx) = tien<4>(xdxDesc.GetLengths()); + + network_config += "n_y" + std::to_string(n_y) + "c_y" + std::to_string(c_y) + "h_y" + + std::to_string(h_y) + "w_y" + std::to_string(w_y) + + + "n_dy" + std::to_string(n_dy) + "c_dy" + std::to_string(c_dy) + "h_dy" + + std::to_string(h_dy) + "w_dy" + std::to_string(w_dy) + + + "n_dx" + std::to_string(n_dx) + "c_dx" + std::to_string(c_dx) + "h_dx" + + std::to_string(h_dx) + "w_dx" + std::to_string(w_dx); + + network_config += "ypk" + std::to_string(static_cast(yDesc.IsPacked())) + "dypk" + + std::to_string(static_cast(dyDesc.IsPacked())) + "dxpk" + + std::to_string(static_cast(xdxDesc.IsPacked())); + } + + network_config += "a" + std::to_string(alpha) + "b" + std::to_string(beta) + "algo" + + std::to_string(static_cast(algorithm)) + "mode" + + std::to_string(static_cast(mode)); + + return NetworkConfig{network_config}; +} + +} // namespace softmax + +} // namespace miopen diff --git a/src/solver/softmax/softmax.cpp b/src/solver/softmax/softmax.cpp new file mode 100644 index 0000000000..92b5d35548 --- /dev/null +++ b/src/solver/softmax/softmax.cpp @@ -0,0 +1,341 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace softmax { + +int nextPow2(int v) +{ + if(v == 1) + { + return (v << 1); + } + else + { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; + } +} + +void getParams(const TensorDescriptor& in_desc, + miopenSoftmaxMode_t in_mode, + int& out_n, + int& out_c, + int& out_h, + int& out_w, + int& out_grid_size, + int& out_spatial_dim, + int& out_vector_size, + int& out_num_batch, + bool& out_usefp16, + bool& out_usefp32, + std::vector& out_vld, + std::vector& out_vgd, + size_t& out_workgroups, + int& out_batch_size, + int& out_u_batch_size) +{ + std::tie(out_n, out_c, out_h, out_w) = tien<4>(in_desc.GetLengths()); + + // using workgroup size of 256 by default + out_grid_size = in_mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? out_n : out_n * out_h * out_w; + out_spatial_dim = in_mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? 1 : out_h * out_w; + out_vector_size = in_mode == MIOPEN_SOFTMAX_MODE_INSTANCE ? out_c * out_h * out_w : out_c; + // num_spatial_dims or pixels each workgroup can compute + + /// \todo Magic numbers + out_num_batch = out_vector_size < 256 ? nextPow2(256 / out_vector_size) : 1; + + out_vld = {256, 1, 1}; + + out_usefp16 = false; + out_usefp32 = true; + if(in_desc.GetType() == miopenHalf) + { + out_usefp16 = true; + out_usefp32 = false; + } + + if(out_num_batch == 1) + { + out_workgroups = std::min(out_grid_size, 64 * 40 * 8); + out_vgd = {out_workgroups * out_vld[0], 1, 1}; + + out_batch_size = 0; + out_u_batch_size = 0; + } + else + { + out_batch_size = 256 / out_num_batch; + // num_channels each threads iterates over to cover all the channels + out_u_batch_size = + (out_vector_size > out_batch_size) ? nextPow2(out_vector_size / out_batch_size) : 1; + + out_workgroups = (out_grid_size % out_num_batch == 0) ? (out_grid_size / out_num_batch) + : (out_grid_size / out_num_batch + 1); + out_vgd = {out_workgroups * out_vld[0], 1, 1}; + } +} + +bool Softmax::IsApplicable( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::softmax::ProblemDescription& problem) const +{ + return true; +} + +std::size_t +Softmax::GetWorkspaceSize([[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::softmax::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution Softmax::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::softmax::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto xDesc = problem.GetXDesc(); + auto yDesc = problem.GetYDesc(); + + auto dxDesc = problem.GetdXDesc(); + auto dyDesc = problem.GetdYDesc(); + + auto alpha = problem.GetAlpha(); + auto beta = problem.GetBeta(); + auto mode = problem.GetMode(); + auto algorithm = problem.GetAlgorithm(); + + bool isForward = problem.IsForward(); + + int n, c, h, w; + // using workgroup size of 256 by default + int grid_size, spatial_dim, vector_size, num_batch; + + std::vector vld; + std::vector vgd; + + bool usefp16, usefp32; + + size_t workgroups; + int batch_size; + int u_batch_size; + + getParams(yDesc, + mode, + n, + c, + h, + w, + grid_size, + spatial_dim, + vector_size, + num_batch, + usefp16, + usefp32, + vld, + vgd, + workgroups, + batch_size, + u_batch_size); + + if(num_batch > 1) + { + if(isForward) + { + /// \todo Magic numbers + if((u_batch_size + 1) * 256 > 65536 && yDesc.GetType() == miopenHalf) + MIOPEN_THROW(miopenStatusBadParm, "Exceed local memory capacity"); + } + else + { + /// \todo Magic numbers + if((2 * u_batch_size + 1) * 256 > 65536 && yDesc.GetType() == miopenHalf) + MIOPEN_THROW(miopenStatusBadParm, "Exceed local memory capacity"); + } + } + + KernelBuildParameters build_params = KernelBuildParameters{{"NUM_BATCH", num_batch}}; + + if(num_batch > 1) + { + build_params.Define("BATCH_SIZE", batch_size); + build_params.Define("U_BATCH_SIZE", u_batch_size); + } + + build_params.Define("MIOPEN_USE_FP16", static_cast(usefp16)); + build_params.Define("MIOPEN_USE_FP32", static_cast(usefp32)); + + if(algorithm == MIOPEN_SOFTMAX_LOG) + build_params.Define("USE_SOFTMAX_LOG", 1); + else if(algorithm == MIOPEN_SOFTMAX_FAST) + build_params.Define("USE_SOFTMAX_FAST", 1); + else + build_params.Define("USE_SOFTMAX_ACCURATE", 1); + + if(mode == MIOPEN_SOFTMAX_MODE_INSTANCE) + build_params.Define("USE_SOFTMAX_MODE_INSTANCE", 1); + else + build_params.Define("USE_SOFTMAX_MODE_CHANNEL", 1); + + build_params.Define("RUN_FORWARD", isForward ? 1 : 0); + + if(isForward) + { + build_params.Define("IS_INPUT_PACKED", static_cast(xDesc.IsPacked())); + build_params.Define("IS_OUTPUT_PACKED", static_cast(yDesc.IsPacked())); + } + else + { + build_params.Define("IS_OUTPUT_PACKED", static_cast(yDesc.IsPacked())); + build_params.Define("IS_DOUTPUT_PACKED", static_cast(dyDesc.IsPacked())); + build_params.Define("IS_DINPUT_PACKED", static_cast(dxDesc.IsPacked())); + } + + if(!float_equal(alpha, 1.0)) + build_params.Define("USE_ALPHA", 1); + + if(!float_equal(beta, 0)) + build_params.Define("USE_BETA", 1); + + auto kernel = KernelInfo{}; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.kernel_file = "MIOpenSoftmax.cl"; + + kernel.kernel_name = isForward ? "SoftmaxForward" : "SoftmaxBackward"; + + for(unsigned int i = 0; i < 2; ++i) + { + kernel.l_wk.push_back(vld[i]); + kernel.g_wk.push_back(vgd[i]); + } + + if(isForward) + { + int in_nstr, in_cstr, in_hstr; + std::tie(in_nstr, in_cstr, in_hstr, std::ignore) = tien<4>(xDesc.GetStrides()); + + int out_nstr, out_cstr, out_hstr; + std::tie(out_nstr, out_cstr, out_hstr, std::ignore) = tien<4>(yDesc.GetStrides()); + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel(params.x, + params.forward_y, + vector_size, + grid_size, + spatial_dim, + h, + w, + in_nstr, + in_cstr, + in_hstr, + out_nstr, + out_cstr, + out_hstr, + params.xdx_offset, + params.y_offset, + alpha, + beta); + }; + }; + } + else + { + int din_nstr, din_cstr, din_hstr; + std::tie(din_nstr, din_cstr, din_hstr, std::ignore) = tien<4>(dxDesc.GetStrides()); + + int dout_nstr, dout_cstr, dout_hstr; + std::tie(dout_nstr, dout_cstr, dout_hstr, std::ignore) = tien<4>(dyDesc.GetStrides()); + + int out_nstr, out_cstr, out_hstr; + std::tie(out_nstr, out_cstr, out_hstr, std::ignore) = tien<4>(yDesc.GetStrides()); + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel(params.backward_y, + params.dy, + params.dx, + vector_size, + grid_size, + spatial_dim, + h, + w, + out_nstr, + out_cstr, + out_hstr, + dout_nstr, + dout_cstr, + dout_hstr, + din_nstr, + din_cstr, + din_hstr, + params.y_offset, + params.dy_offset, + params.xdx_offset, + alpha, + beta); + }; + }; + } + + result.construction_params.push_back(kernel); + + return result; +} + +} // namespace softmax + +} // namespace solver + +} // namespace miopen