Commit bf8ea41 (1 parent: d628e8e)
Added MISH activation, use activation=mish in [convolutional] layers
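As the commit message says, the activation is selected per layer in the network .cfg file. A minimal [convolutional] block for illustration (only the activation= line is the subject of this commit; the surrounding fields are ordinary layer settings, not prescribed by it):

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=mish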

7 files changed: 85 additions, 14 deletions

include/darknet.h
Lines changed: 3 additions & 3 deletions

@@ -102,7 +102,7 @@ typedef struct tree {
 
 // activations.h
 typedef enum {
-    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, SWISH
+    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, SWISH, MISH
 }ACTIVATION;
 
 // parser.h
@@ -347,7 +347,7 @@ struct layer {
     float *col_image;
     float * delta;
     float * output;
-    float * output_sigmoid;
+    float * activation_input;
     int delta_pinned;
     int output_pinned;
     float * loss;
@@ -532,7 +532,7 @@ struct layer {
 
     float * input_antialiasing_gpu;
    float * output_gpu;
-    float * output_sigmoid_gpu;
+    float * activation_input_gpu;
     float * loss_gpu;
     float * delta_gpu;
     float * rand_gpu;

src/activation_kernels.cu
Lines changed: 36 additions & 0 deletions

@@ -199,6 +199,16 @@ __global__ void activate_array_swish_kernel(float *x, int n, float *output_sigmo
     }
 }
 
+__global__ void activate_array_mish_kernel(float *x, int n, float *activation_input, float *output_gpu)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i < n) {
+        float x_val = x[i];
+        activation_input[i] = x_val;    // store value before activation
+        output_gpu[i] = x_val * tanh_activate_kernel(log(1 + expf(x_val)));
+    }
+}
+
 __global__ void activate_array_leaky_kernel(float *x, int n)
 {
     int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -263,6 +273,18 @@ __global__ void gradient_array_swish_kernel(float *x, int n, float *sigmoid_gpu,
     }
 }
 
+__global__ void gradient_array_mish_kernel(int n, float *activation_input, float *delta)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i < n) {
+        float x = activation_input[i];
+        float d = 2 * expf(x) + expf(2 * x) + 2;
+        float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6);
+        float derivative = expf(x) * w / (d * d);
+        delta[i] *= derivative;
+    }
+}
+
 __global__ void gradient_array_leaky_kernel(float *x, int n, float *delta)
 {
     int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -333,6 +355,13 @@ extern "C" void activate_array_swish_ongpu(float *x, int n, float *output_sigmoi
     CHECK_CUDA(cudaPeekAtLastError());
 }
 
+extern "C" void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu)
+{
+    const int num_blocks = get_number_of_blocks(n, BLOCK);
+    activate_array_mish_kernel <<< cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (x, n, activation_input_gpu, output_gpu);
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
 extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
 {
     const int num_blocks = get_number_of_blocks(n, BLOCK);
@@ -354,4 +383,11 @@ extern "C" void gradient_array_swish_ongpu(float *x, int n, float *sigmoid_gpu,
     const int num_blocks = get_number_of_blocks(n, BLOCK);
     gradient_array_swish_kernel <<< cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (x, n, sigmoid_gpu, delta);
     CHECK_CUDA(cudaPeekAtLastError());
+}
+
+extern "C" void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta)
+{
+    const int num_blocks = get_number_of_blocks(n, BLOCK);
+    gradient_array_mish_kernel <<< cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>> (n, activation_input_gpu, delta);
+    CHECK_CUDA(cudaPeekAtLastError());
 }
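The w and d expressions in gradient_array_mish_kernel are the standard closed form of the Mish derivative. Differentiating mish(x) = x * tanh(sp(x)) with sp(x) = ln(1 + e^x) and sigmoid(x) = e^x / (1 + e^x) gives

    mish'(x) = tanh(sp(x)) + x * sigmoid(x) * (1 - tanh^2(sp(x)))

and collecting every term over the common denominator (2*e^x + e^(2x) + 2)^2 yields

    mish'(x) = e^x * w / d^2
    w = 4*(x + 1) + 4*e^(2x) + e^(3x) + e^x*(4x + 6)
    d = 2*e^x + e^(2x) + 2

exactly the quantities the kernel computes. Sanity check at x = 0: d = 5 and w = 15, so mish'(0) = 15/25 = 0.6, matching tanh(ln 2) = 1.5/2.5 = 0.6.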

src/activations.c
Lines changed: 25 additions & 0 deletions

@@ -46,6 +46,7 @@ ACTIVATION get_activation(char *s)
 {
     if (strcmp(s, "logistic")==0) return LOGISTIC;
     if (strcmp(s, "swish") == 0) return SWISH;
+    if (strcmp(s, "mish") == 0) return MISH;
     if (strcmp(s, "loggy")==0) return LOGGY;
     if (strcmp(s, "relu")==0) return RELU;
     if (strcmp(s, "elu")==0) return ELU;
@@ -133,6 +134,17 @@ void activate_array_swish(float *x, const int n, float * output_sigmoid, float *
     }
 }
 
+void activate_array_mish(float *x, const int n, float * activation_input, float * output)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < n; ++i) {
+        float x_val = x[i];
+        activation_input[i] = x_val;    // store value before activation
+        output[i] = x_val * tanh_activate(log(1 + expf(x_val)));
+    }
+}
+
 float gradient(float x, ACTIVATION a)
 {
     switch(a){
@@ -187,3 +199,16 @@ void gradient_array_swish(const float *x, const int n, const float * sigmoid, fl
         delta[i] *= swish + sigmoid[i]*(1 - swish);
     }
 }
+
+void gradient_array_mish(const int n, const float * activation_input, float * delta)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < n; ++i) {
+        float x = activation_input[i];
+        float d = 2 * expf(x) + expf(2 * x) + 2;
+        float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6);
+        float derivative = expf(x) * w / (d * d);
+        delta[i] *= derivative;
+    }
+}
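A quick way to convince yourself the forward/backward pair above is consistent is a finite-difference check. The following is a minimal standalone sketch, not part of the commit: the names mish and mish_check are hypothetical, and it uses plain libm calls instead of darknet's tanh_activate. Compile with e.g. gcc mish_check.c -lm.

/* mish_check.c: compare the closed-form Mish derivative (the same
 * w/d formula as gradient_array_mish above) against a central
 * finite difference of the forward function. */
#include <math.h>
#include <stdio.h>

static float mish(float x)
{
    return x * tanhf(logf(1 + expf(x)));    /* x * tanh(softplus(x)) */
}

static float mish_grad(float x)
{
    /* identical algebra to gradient_array_mish() */
    float d = 2 * expf(x) + expf(2 * x) + 2;
    float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x) * (4 * x + 6);
    return expf(x) * w / (d * d);
}

int main(void)
{
    const float eps = 1e-3f;
    const float xs[5] = { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f };
    for (int i = 0; i < 5; ++i) {
        float x = xs[i];
        /* central difference approximation of mish'(x) */
        float numeric = (mish(x + eps) - mish(x - eps)) / (2 * eps);
        printf("x=%5.2f  mish=%8.5f  closed=%8.5f  numeric=%8.5f\n",
               x, mish(x), mish_grad(x), numeric);
    }
    return 0;
}

The two derivative columns should agree to several decimal places at every test point.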

src/activations.h
Lines changed: 5 additions & 1 deletion

@@ -5,7 +5,7 @@
 #include "math.h"
 
 //typedef enum{
-//    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU
+//    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, SWISH, MISH
 //}ACTIVATION;
 
 #ifdef __cplusplus
@@ -18,13 +18,17 @@ float activate(float x, ACTIVATION a);
 float gradient(float x, ACTIVATION a);
 void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
 void gradient_array_swish(const float *x, const int n, const float * sigmoid, float * delta);
+void gradient_array_mish(const int n, const float * activation_input, float * delta);
 void activate_array(float *x, const int n, const ACTIVATION a);
 void activate_array_swish(float *x, const int n, float * output_sigmoid, float * output);
+void activate_array_mish(float *x, const int n, float * activation_input, float * output);
 #ifdef GPU
 void activate_array_ongpu(float *x, int n, ACTIVATION a);
 void activate_array_swish_ongpu(float *x, int n, float *output_sigmoid_gpu, float *output_gpu);
+void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu);
 void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
 void gradient_array_swish_ongpu(float *x, int n, float *sigmoid_gpu, float *delta);
+void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta);
 #endif
 
 static inline float stair_activate(float x)

src/convolutional_kernels.cu
Lines changed: 6 additions & 3 deletions

@@ -392,7 +392,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     */
 
     //add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
-    if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.output_gpu);
+    if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
+    else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
     else if (l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
     //if(l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
     //if (l.binary || l.xnor) swap_binary(&l);
@@ -596,7 +597,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     //#ifndef CUDNN_HALF
     //#endif // no CUDNN_HALF
 
-    if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.output_gpu);
+    if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
+    else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
     else if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
     //if(l.dot > 0) dot_error_gpu(l);
     if(l.binary || l.xnor) swap_binary(&l);
@@ -639,7 +641,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 
     if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
 
-    if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.delta_gpu);
+    if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu);
+    else if (l.activation == MISH) gradient_array_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu);
     else gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
 
     if (!l.batch_normalize)
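One way to read the dispatch above: the forward pass stores x in l.activation_input_gpu and overwrites l.output_gpu with y = mish(x); the backward pass then applies the chain rule elementwise,

    dL/dx_i = dL/dy_i * mish'(x_i)

by scaling l.delta_gpu in place. The generic gradient_array_ongpu path reconstructs an activation's gradient from l.output_gpu alone, which works for functions like the logistic (gradient y*(1-y)) but not for Mish, whose derivative is not a simple function of its output; hence the dedicated input buffer that must persist between the forward and backward passes.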

src/convolutional_layer.c
Lines changed: 8 additions & 5 deletions

@@ -473,10 +473,10 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w,
         l.scale_v = (float*)calloc(n, sizeof(float));
     }
 
-    if(l.activation == SWISH) l.output_sigmoid = (float*)calloc(total_batch*l.outputs, sizeof(float));
+    if (l.activation == SWISH || l.activation == MISH) l.activation_input = (float*)calloc(total_batch*l.outputs, sizeof(float));
 
 #ifdef GPU
-    if (l.activation == SWISH) l.output_sigmoid_gpu = cuda_make_array(l.output_sigmoid, total_batch*out_h*out_w*n);
+    if (l.activation == SWISH || l.activation == MISH) l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*out_h*out_w*n);
 
     l.forward_gpu = forward_convolutional_layer_gpu;
     l.backward_gpu = backward_convolutional_layer_gpu;
@@ -1100,7 +1100,8 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
     add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
 
     //activate_array(l.output, m*n*l.batch, l.activation);
-    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.output_sigmoid, l.output);
+    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
     else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
     return;
 
@@ -1139,7 +1140,8 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
     add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
 
     //activate_array(l.output, m*n*l.batch, l.activation);
-    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.output_sigmoid, l.output);
+    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
+    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
     else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation);
 
     if(l.binary || l.xnor) swap_binary(&l);
@@ -1276,7 +1278,8 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
     int n = l.size*l.size*l.c / l.groups;
     int k = l.out_w*l.out_h;
 
-    if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.output_sigmoid, l.delta);
+    if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta);
+    else if (l.activation == MISH) gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta);
     else gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
 
     if (l.batch_normalize) {

src/layer.c
Lines changed: 2 additions & 2 deletions

@@ -90,7 +90,7 @@ void free_layer(layer l)
 #endif // GPU
     if (l.delta) free(l.delta), l.delta = NULL;
     if (l.output) free(l.output), l.output = NULL;
-    if (l.output_sigmoid) free(l.output_sigmoid), l.output_sigmoid = NULL;
+    if (l.activation_input) free(l.activation_input), l.activation_input = NULL;
     if (l.squared) free(l.squared);
     if (l.norms) free(l.norms);
     if (l.spatial_mean) free(l.spatial_mean);
@@ -176,7 +176,7 @@ void free_layer(layer l)
     if (l.scale_updates_gpu) cuda_free(l.scale_updates_gpu), l.scale_updates_gpu = NULL;
     if (l.input_antialiasing_gpu) cuda_free(l.input_antialiasing_gpu), l.input_antialiasing_gpu = NULL;
     if (l.output_gpu) cuda_free(l.output_gpu), l.output_gpu = NULL;
-    if (l.output_sigmoid_gpu) cuda_free(l.output_sigmoid_gpu), l.output_sigmoid_gpu = NULL;
+    if (l.activation_input_gpu) cuda_free(l.activation_input_gpu), l.activation_input_gpu = NULL;
     if (l.delta_gpu) cuda_free(l.delta_gpu), l.delta_gpu = NULL;
     if (l.rand_gpu) cuda_free(l.rand_gpu);
     if (l.squared_gpu) cuda_free(l.squared_gpu);
