From 6fb817f68b471da21c04f8bd5ffef0d3c4d67c30 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sat, 22 Feb 2020 01:04:55 +0300
Subject: [PATCH] Accelerated DropBlock on GPU

---
 include/darknet.h            |   2 +
 src/dropout_layer.c          |   6 +-
 src/dropout_layer_kernels.cu | 139 ++++++++++++++++++++++++++++++++++-
 src/layer.c                  |   4 +-
 4 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index ac50e49e..a13168da 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -586,6 +586,8 @@ struct layer {
     float * loss_gpu;
     float * delta_gpu;
     float * rand_gpu;
+    float * drop_blocks_scale;
+    float * drop_blocks_scale_gpu;
     float * squared_gpu;
     float * norms_gpu;

diff --git a/src/dropout_layer.c b/src/dropout_layer.c
index 395e6a35..3778a7b9 100644
--- a/src/dropout_layer.c
+++ b/src/dropout_layer.c
@@ -29,11 +29,13 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability, int d
     l.scale = 1./(1.0 - probability);
     l.forward = forward_dropout_layer;
     l.backward = backward_dropout_layer;
-    #ifdef GPU
+#ifdef GPU
     l.forward_gpu = forward_dropout_layer_gpu;
     l.backward_gpu = backward_dropout_layer_gpu;
     l.rand_gpu = cuda_make_array(l.rand, inputs*batch);
-    #endif
+    l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch);
+    l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch);
+#endif
     if (l.dropblock) {
         if(l.dropblock_size_abs) fprintf(stderr, "dropblock p = %.2f l.dropblock_size_abs = %d %4d -> %4d\n", probability, l.dropblock_size_abs, inputs, inputs);
         else fprintf(stderr, "dropblock p = %.2f l.dropblock_size_rel = %.2f %4d -> %4d\n", probability, l.dropblock_size_rel, inputs, inputs);
diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu
index a01bb9c6..1a753e3e 100644
--- a/src/dropout_layer_kernels.cu
+++ b/src/dropout_layer_kernels.cu
@@ -6,6 +6,88 @@
 #include "dropout_layer.h"
 #include "dark_cuda.h"
 #include "utils.h"
+#include "blas.h"
+
+
+__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int block_size, float *drop_blocks_scale, float *output)
+{
+    const int threads = BLOCK;
+    __shared__ int index_block;
+
+    const int id = threadIdx.x;
+    const int f = blockIdx.x % filters;
+    const int b = blockIdx.x / filters;
+
+    if(id == 0) index_block = -1;
+    __syncthreads();
+
+    int i;
+    for (i = 0; i < spatial; i += threads) {
+        int index = b*spatial*f + f*spatial + i + id;
+
+        if (i + id < spatial) {
+            if (rand[id] < prob) {
+                index_block = id;
+            }
+        }
+    }
+
+    __syncthreads();
+    if (index_block == -1) return;
+
+
+    int b_x = index_block % w;
+    int b_y = index_block / w;
+
+    b_x = max(0, min(b_x, w - block_size));
+    b_y = max(0, min(b_y, h - block_size));
+
+    int block_square_size = block_size * block_size;
+
+    for (i = 0; i < block_square_size; i += threads)
+    {
+        int i_x = i % w;
+        int i_y = i / w;
+
+        int x = b_x + i_x;
+        int y = b_y + i_y;
+
+        int index = b*spatial*f + f*spatial + y*w + x;
+
+        output[index] = 0;
+    }
+
+    /*
+    if (id == 0) {
+
+        for (int x = b_x; x < (b_x + block_size); ++x)
+        {
+            for (int y = b_y; y < (b_y + block_size); ++y)
+            {
+                int index = b*spatial*f + f*spatial + y*w + x;
+
+                output[index] = 0;
+            }
+        }
+    }
+    */
+
+    if (id == 0 && drop_blocks_scale) {
+        atomicAdd(&drop_blocks_scale[b], 1);
+    }
+
+}
+
+
+__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    const int b = size / outputs;
+    output[index] *= drop_blocks_scale[b];
+}
+

 __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
 {
@@ -27,8 +109,8 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)

     // We gradually increase the block size and the probability of dropout - during the first half of the training
     float multiplier = 1.0;
-    if(iteration_num < (state.net.max_batches / 2))
-        multiplier = (iteration_num / (float)(state.net.max_batches / 2));
+    if(iteration_num < (state.net.max_batches))
+        multiplier = (iteration_num / (float)(state.net.max_batches));

     // dropblock
     if (l.dropblock) {
@@ -47,6 +129,31 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
         block_width = max_val_cmp(1, block_width);
         block_height = max_val_cmp(1, block_height);

+        int size = l.inputs*l.batch;
+        cuda_random(l.rand_gpu, size);
+
+        fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
+
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, cur_prob, l.w, l.h, l.w*l.h, l.c, block_width, l.drop_blocks_scale_gpu, state.input);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+        for (int b = 0; b < l.batch; ++b) {
+            const float prob = l.drop_blocks_scale[b] * block_width * block_width / (float)l.outputs;
+            const float scale = 1.0f / (1.0f - prob);
+            l.drop_blocks_scale[b] = scale;
+            //printf(" , %f , ", scale);
+        }
+
+        cuda_push_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        /*
         const float part_occupied_by_block = block_width * block_height / ((float)l.w * l.h);
         const float prob_place_block = cur_prob / (part_occupied_by_block * max_blocks_per_channel);

@@ -93,7 +200,7 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)

         drop_block_kernel << > > (state.input, size, l.rand_gpu, l.scale);
         CHECK_CUDA(cudaPeekAtLastError());
-
+        */
     }
     // dropout
     else {
@@ -122,8 +229,32 @@ void backward_dropout_layer_gpu(dropout_layer l, network_state state)

     // dropblock
     if (l.dropblock) {
-        drop_block_kernel << > > (state.delta, size, l.rand_gpu, l.scale);
+        int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+        float multiplier = 1.0;
+        if (iteration_num < (state.net.max_batches))
+            multiplier = (iteration_num / (float)(state.net.max_batches));
+
+        int block_width = l.dropblock_size_abs * multiplier;
+        int block_height = l.dropblock_size_abs * multiplier;
+
+        if (l.dropblock_size_rel) {
+            block_width = l.dropblock_size_rel * l.w * multiplier;
+            block_height = l.dropblock_size_rel * l.h * multiplier;
+        }
+
+        block_width = max_val_cmp(1, block_width);
+        block_height = max_val_cmp(1, block_height);
+
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, l.probability, l.w, l.h, l.w*l.h, l.c, block_width, NULL, state.delta);
         CHECK_CUDA(cudaPeekAtLastError());
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.delta, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        //drop_block_kernel << > > (state.delta, size, l.rand_gpu, l.scale);
+        //CHECK_CUDA(cudaPeekAtLastError());
     }
     // dropout
     else {
diff --git a/src/layer.c b/src/layer.c
index d00e2213..4bc84441 100644
--- a/src/layer.c
+++ b/src/layer.c
@@ -55,7 +55,9 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
     if (l.type == DROPOUT) {
         if (l.rand) free(l.rand);
 #ifdef GPU
-        if (l.rand_gpu) cuda_free(l.rand_gpu);
+        if (l.rand_gpu) cuda_free(l.rand_gpu);
+        if (l.drop_blocks_scale) cuda_free_host(l.drop_blocks_scale);
+        if (l.drop_blocks_scale_gpu) cuda_free(l.drop_blocks_scale_gpu);
 #endif
         return;
     }
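
The sketch below is a rough CPU-side illustration of what the new GPU path computes; it is not part of the patch or of darknet. It assumes NCHW layout (the same b*outputs + f*spatial + y*w + x indexing the kernels use) and calls rand() where the patch uses cuda_random(); the function name dropblock_forward_cpu and the exact way the block origin is sampled are illustrative choices. The part worth reading is the rescaling step: each image's dropped fraction is estimated as dropped_blocks * block_size^2 / outputs, and the surviving activations are multiplied by 1 / (1 - dropped_fraction), which is what the host loop in forward_dropout_layer_gpu together with scale_dropblock_kernel implements.

/* Illustrative CPU sketch (assumptions above): for each (image, channel),
 * with probability `prob` zero one block_size x block_size block at a random
 * position, then rescale the whole image by 1/(1 - dropped_fraction). */
#include <stdlib.h>

static void dropblock_forward_cpu(float *x, int batch, int c, int h, int w,
                                  int block_size, float prob)
{
    const int spatial = h * w;
    const int outputs = c * spatial;      /* plays the role of l.outputs */

    for (int b = 0; b < batch; ++b) {
        int dropped_blocks = 0;           /* plays the role of l.drop_blocks_scale[b] */

        for (int f = 0; f < c; ++f) {
            /* decide whether this (image, channel) pair gets a dropped block */
            if ((float)rand() / (float)RAND_MAX >= prob) continue;

            /* random block origin, clamped so the block stays inside the map */
            int b_x = rand() % w;
            int b_y = rand() % h;
            if (b_x > w - block_size) b_x = w - block_size;
            if (b_y > h - block_size) b_y = h - block_size;
            if (b_x < 0) b_x = 0;
            if (b_y < 0) b_y = 0;

            /* zero the block */
            for (int y = b_y; y < b_y + block_size && y < h; ++y)
                for (int xx = b_x; xx < b_x + block_size && xx < w; ++xx)
                    x[b*outputs + f*spatial + y*w + xx] = 0;

            ++dropped_blocks;
        }

        /* same per-image rescaling as the host loop in forward_dropout_layer_gpu:
         * dropped fraction ~= dropped_blocks * block_size^2 / outputs */
        const float dropped = dropped_blocks * block_size * block_size / (float)outputs;
        const float scale = 1.0f / (1.0f - dropped);
        for (int i = 0; i < outputs; ++i) x[b*outputs + i] *= scale;
    }
}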