add gelu vulkan operator (#5001)

FhqTreap · web-flow · commit cc54b889d5c1 · 2023-09-04T17:17:24.000+08:00
diff --git a/src/layer/vulkan/gelu_vulkan.cpp b/src/layer/vulkan/gelu_vulkan.cpp
@@ -0,0 +1,181 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "gelu_vulkan.h"
+
+#include "layer_shader_type.h"
+
+namespace ncnn {
+
+// Advertise Vulkan support (buffer and image storage) and start with no
+// pipelines; create_pipeline() allocates them later.
+GELU_vulkan::GELU_vulkan()
+    : pipeline_gelu(0), pipeline_gelu_pack4(0), pipeline_gelu_pack8(0)
+{
+    support_image_storage = true;
+    support_vulkan = true;
+}
+
+// Build the compute pipelines used by forward_inplace().
+//
+// With a top shape hint (dims 1..4) the packing factor is derived from the
+// packed axis, the packed shape is baked into the specialization constants and
+// only the pipeline for that packing factor is created.
+// Without a hint (dims == 0) the specialization constants are taken from a
+// default-constructed Mat and the pack1 and pack4 pipelines are both created,
+// plus pack8 when opt.use_shader_pack8 is set.
+//
+// Always returns 0.
+int GELU_vulkan::create_pipeline(const Option& opt)
+{
+    Mat shape;
+    if (!top_shapes.empty())
+        shape = top_shapes[0];
+
+    // Length of the axis that gets packed: w for 1-D, h for 2-D, c for 3-D/4-D.
+    bool has_pack_axis = true;
+    int pack_axis_len = 0;
+    switch (shape.dims)
+    {
+    case 1:
+        pack_axis_len = shape.w;
+        break;
+    case 2:
+        pack_axis_len = shape.h;
+        break;
+    case 3:
+    case 4:
+        pack_axis_len = shape.c;
+        break;
+    default:
+        has_pack_axis = false;
+        break;
+    }
+
+    // Prefer 8-wide packing when allowed and divisible, then 4-wide, else scalar.
+    int elempack = 1;
+    if (has_pack_axis)
+    {
+        if (opt.use_shader_pack8 && pack_axis_len % 8 == 0)
+            elempack = 8;
+        else if (pack_axis_len % 4 == 0)
+            elempack = 4;
+    }
+
+    // Scalars take 2 bytes with fp16 storage, or with fp16 packed when the
+    // element is actually packed; otherwise they take 4 bytes.
+    const bool two_byte_scalar = opt.use_fp16_storage || (opt.use_fp16_packed && elempack != 1);
+    const size_t elemsize = elempack * (two_byte_scalar ? 2u : 4u);
+
+    // Shape-only Mat (null data pointer) describing the blob after packing.
+    Mat shape_packed;
+    switch (shape.dims)
+    {
+    case 1:
+        shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack);
+        break;
+    case 2:
+        shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack);
+        break;
+    case 3:
+        shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack);
+        break;
+    case 4:
+        shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack);
+        break;
+    default:
+        break;
+    }
+
+    // Same five values, in the same order, as the shader's constant_id 0..4.
+    std::vector<vk_specialization_type> specializations(5);
+    specializations[0].i = shape_packed.dims;
+    specializations[1].i = shape_packed.w;
+    specializations[2].i = shape_packed.h * shape_packed.d; // h and d share the y axis
+    specializations[3].i = shape_packed.c;
+    specializations[4].i = shape_packed.cstep;
+
+    // Workgroup size hint, clamped to the packed extents.
+    Mat local_size_xyz;
+    switch (shape_packed.dims)
+    {
+    case 1:
+        local_size_xyz.w = std::min(64, shape_packed.w);
+        local_size_xyz.h = 1;
+        local_size_xyz.c = 1;
+        break;
+    case 2:
+        local_size_xyz.w = std::min(8, shape_packed.w);
+        local_size_xyz.h = std::min(8, shape_packed.h);
+        local_size_xyz.c = 1;
+        break;
+    case 3:
+        local_size_xyz.w = std::min(4, shape_packed.w);
+        local_size_xyz.h = std::min(4, shape_packed.h);
+        local_size_xyz.c = std::min(4, shape_packed.c);
+        break;
+    case 4:
+        local_size_xyz.w = std::min(4, shape_packed.w);
+        local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d);
+        local_size_xyz.c = std::min(4, shape_packed.c);
+        break;
+    default:
+        break;
+    }
+
+    const bool no_shape_hint = shape.dims == 0;
+
+    // pack1
+    if (no_shape_hint || elempack == 1)
+    {
+        pipeline_gelu = new Pipeline(vkdev);
+        pipeline_gelu->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_gelu->create(LayerShaderType::gelu, opt, specializations);
+    }
+
+    // pack4
+    if (no_shape_hint || elempack == 4)
+    {
+        pipeline_gelu_pack4 = new Pipeline(vkdev);
+        pipeline_gelu_pack4->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_gelu_pack4->create(LayerShaderType::gelu_pack4, opt, specializations);
+    }
+
+    // pack8 (without a shape hint, only when the option allows 8-wide shaders)
+    if ((opt.use_shader_pack8 && no_shape_hint) || elempack == 8)
+    {
+        pipeline_gelu_pack8 = new Pipeline(vkdev);
+        pipeline_gelu_pack8->set_optimal_local_size_xyz(local_size_xyz);
+        pipeline_gelu_pack8->create(LayerShaderType::gelu_pack8, opt, specializations);
+    }
+
+    return 0;
+}
+
+// Free every pipeline this layer owns and null the pointers, so calling this
+// again (or before any pipeline exists) is harmless. Always returns 0.
+int GELU_vulkan::destroy_pipeline(const Option& /*opt*/)
+{
+    Pipeline** const owned[] = {&pipeline_gelu, &pipeline_gelu_pack4, &pipeline_gelu_pack8};
+    const int owned_count = sizeof(owned) / sizeof(owned[0]);
+
+    for (int i = 0; i < owned_count; i++)
+    {
+        delete *owned[i];
+        *owned[i] = 0;
+    }
+
+    return 0;
+}
+
+// Record an in-place GELU dispatch for a buffer-backed blob.
+//
+// The blob is the single binding; its dims, w, h*d, c and cstep are passed as
+// push constants. The pipeline is picked from the blob's elempack (8 -> pack8,
+// 4 -> pack4, anything else -> pack1). Always returns 0.
+int GELU_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const
+{
+    std::vector<vk_constant_type> constants(5);
+    constants[0].i = bottom_top_blob.dims;
+    constants[1].i = bottom_top_blob.w;
+    constants[2].i = bottom_top_blob.h * bottom_top_blob.d;
+    constants[3].i = bottom_top_blob.c;
+    constants[4].i = bottom_top_blob.cstep;
+
+    // Input and output are the same buffer, so one binding is enough.
+    std::vector<VkMat> bindings(1, bottom_top_blob);
+
+    const Pipeline* pipeline = pipeline_gelu;
+    switch (bottom_top_blob.elempack)
+    {
+    case 8:
+        pipeline = pipeline_gelu_pack8;
+        break;
+    case 4:
+        pipeline = pipeline_gelu_pack4;
+        break;
+    default:
+        break;
+    }
+
+    cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
+
+    return 0;
+}
+
+// Record an in-place GELU dispatch for an image-backed blob.
+//
+// The same image is bound twice (binding 0 and binding 1). dims, w, h*d and c
+// are passed as push constants; the cstep slot is set to 0 on this path. The
+// pipeline is picked from the blob's elempack (8 -> pack8, 4 -> pack4,
+// anything else -> pack1). Always returns 0.
+int GELU_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const
+{
+    std::vector<vk_constant_type> constants(5);
+    constants[0].i = bottom_top_blob.dims;
+    constants[1].i = bottom_top_blob.w;
+    constants[2].i = bottom_top_blob.h * bottom_top_blob.d;
+    constants[3].i = bottom_top_blob.c;
+    constants[4].i = 0; // bottom_top_blob.cstep is not passed for images
+
+    // Both bindings refer to the same image.
+    std::vector<VkImageMat> bindings(2, bottom_top_blob);
+
+    const Pipeline* pipeline = pipeline_gelu;
+    switch (bottom_top_blob.elempack)
+    {
+    case 8:
+        pipeline = pipeline_gelu_pack8;
+        break;
+    case 4:
+        pipeline = pipeline_gelu_pack4;
+        break;
+    default:
+        break;
+    }
+
+    cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob);
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/gelu_vulkan.h b/src/layer/vulkan/gelu_vulkan.h
@@ -0,0 +1,42 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_GELU_VULKAN_H
+#define LAYER_GELU_VULKAN_H
+
+#include "gelu.h"
+
+namespace ncnn {
+
+// Vulkan compute variant of the GELU layer.
+// Holds one compute pipeline per packing factor (1, 4, 8) and records in-place
+// dispatches for buffer-backed (VkMat) and image-backed (VkImageMat) blobs.
+class GELU_vulkan : virtual public GELU
+{
+public:
+    // Marks the layer as Vulkan-capable and nulls all pipeline pointers.
+    GELU_vulkan();
+
+    // Allocates the pipelines needed for the (optional) top shape hint and opt.
+    virtual int create_pipeline(const Option& opt);
+    // Deletes all pipelines and resets the pointers to null.
+    virtual int destroy_pipeline(const Option& opt);
+
+    // Keep the base-class forward_inplace overloads visible next to the Vulkan ones.
+    using GELU::forward_inplace;
+    // Records the in-place dispatch for a buffer blob into cmd.
+    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+    // Records the in-place dispatch for an image blob into cmd.
+    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    // Pipelines chosen at forward time by the blob's elempack:
+    // 8 -> pipeline_gelu_pack8, 4 -> pipeline_gelu_pack4, otherwise pipeline_gelu.
+    // Each one is null until create_pipeline() decides it is needed.
+    Pipeline* pipeline_gelu;
+    Pipeline* pipeline_gelu_pack4;
+    Pipeline* pipeline_gelu_pack8;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GELU_VULKAN_H
diff --git a/src/layer/vulkan/shader/gelu.comp b/src/layer/vulkan/shader/gelu.comp
@@ -0,0 +1,72 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+// Shape specialization constants (ids 0..4). The host fills them in the same
+// order from the packed shape hint: dims, w, h (h*d on the host side), c, cstep.
+// They default to 0 when nothing is specialized.
+#define shape_constant_id_offset 0
+layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+// Image path: read through the sampler at binding 0, write to the image at binding 1.
+// Buffer path: a single read/write storage buffer at binding 0 (in-place).
+#if NCNN_image_shader
+layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
+layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
+#else
+layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+#endif
+
+// Runtime shape, pushed per dispatch with the same five fields as above.
+// NOTE(review): psc(x) is defined outside this file; presumably it selects
+// between the specialization constant x and p.x - confirm.
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+} p;
+
+// Apply the tanh-approximated GELU in place to one element per invocation.
+// Invocations outside the (w, h, c) extent exit without touching memory.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
+        return;
+
+#if NCNN_image_shader
+    afp v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz));
+#else
+    // Linear element index: channel stride is cstep, row stride is w.
+    const int gi = gz * psc(cstep) + gy * psc(w) + gx;
+
+    afp v = buffer_ld1(bottom_top_blob_data, gi);
+#endif
+
+    // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
+    // Every literal is converted to afp so the whole expression stays in the
+    // arithmetic type. With NCNN_fp16_arithmetic (afp presumably float16_t) a bare
+    // float literal promotes the result to float, which GLSL will not implicitly
+    // narrow back when assigning to v.
+    v = afp(0.5f) * v * (afp(1.0f) + tanh(afp(0.79788452f) * (v + afp(0.044715f) * v * v * v)));
+
+#if NCNN_image_shader
+    image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v);
+#else
+    buffer_st1(bottom_top_blob_data, gi, v);
+#endif
+}
diff --git a/src/layer/vulkan/shader/gelu_pack4.comp b/src/layer/vulkan/shader/gelu_pack4.comp
@@ -0,0 +1,72 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+// Shape specialization constants (ids 0..4). The host fills them in the same
+// order from the packed shape hint: dims, w, h (h*d on the host side), c, cstep.
+// They default to 0 when nothing is specialized.
+#define shape_constant_id_offset 0
+layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+// Image path: read through the sampler at binding 0, write to the image at binding 1.
+// Buffer path: a single read/write storage buffer of 4-wide elements at binding 0.
+#if NCNN_image_shader
+layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
+layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
+#else
+layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
+#endif
+
+// Runtime shape, pushed per dispatch with the same five fields as above.
+// NOTE(review): psc(x) is defined outside this file; presumably it selects
+// between the specialization constant x and p.x - confirm.
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+} p;
+
+// Apply the tanh-approximated GELU in place to one 4-wide element per invocation.
+// Invocations outside the (w, h, c) extent exit without touching memory.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c))
+        return;
+
+#if NCNN_image_shader
+    afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz));
+#else
+    // Linear element index: channel stride is cstep, row stride is w.
+    const int gi = gz * psc(cstep) + gy * psc(w) + gx;
+
+    afpvec4 v = buffer_ld4(bottom_top_blob_data, gi);
+#endif
+
+    // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
+    // Every literal is converted to afp so the whole expression stays in the
+    // arithmetic vector type. With NCNN_fp16_arithmetic (afp presumably float16_t)
+    // a bare float literal promotes the result to a float vector, which GLSL will
+    // not implicitly narrow back when assigning to v.
+    v = afp(0.5f) * v * (afp(1.0f) + tanh(afp(0.79788452f) * (v + afp(0.044715f) * v * v * v)));
+
+#if NCNN_image_shader
+    image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v);
+#else
+    buffer_st4(bottom_top_blob_data, gi, v);
+#endif
+}
diff --git a/src/layer/vulkan/shader/gelu_pack8.comp b/src/layer/vulkan/shader/gelu_pack8.comp