Accelerated DropBlock on GPU

AlexeyAB 2020-02-22 01:04:55 +03:00
parent 0ee9c8668f
commit 6fb817f68b
4 changed files with 144 additions and 7 deletions

View File

@@ -586,6 +586,8 @@ struct layer {
float * loss_gpu;
float * delta_gpu;
float * rand_gpu;
float * drop_blocks_scale;
float * drop_blocks_scale_gpu;
float * squared_gpu;
float * norms_gpu;

View File

@@ -29,11 +29,13 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability, int d
l.scale = 1./(1.0 - probability);
l.forward = forward_dropout_layer;
l.backward = backward_dropout_layer;
#ifdef GPU
l.forward_gpu = forward_dropout_layer_gpu;
l.backward_gpu = backward_dropout_layer_gpu;
l.rand_gpu = cuda_make_array(l.rand, inputs*batch);
l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch);
l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch);
#endif
if (l.dropblock) {
if(l.dropblock_size_abs) fprintf(stderr, "dropblock p = %.2f l.dropblock_size_abs = %d %4d -> %4d\n", probability, l.dropblock_size_abs, inputs, inputs);
else fprintf(stderr, "dropblock p = %.2f l.dropblock_size_rel = %.2f %4d -> %4d\n", probability, l.dropblock_size_rel, inputs, inputs);
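The scale buffer is allocated twice: drop_blocks_scale as page-locked (pinned) host memory, so the per-batch block counts can be pulled back from the GPU cheaply on every forward pass, and drop_blocks_scale_gpu on the device. The helper's body is not part of this diff; a minimal sketch of what a pinned allocation typically looks like, assuming cuda_make_array_pinned wraps cudaHostAlloc:

    #include <cuda_runtime.h>
    #include <string.h>

    // Sketch of a pinned host allocation helper (assumption: the real
    // cuda_make_array_pinned in dark_cuda.c does something equivalent).
    float *make_pinned_array_sketch(float *src, size_t n)
    {
        float *ptr = NULL;
        // Page-locked memory allows fast, truly asynchronous device<->host
        // copies, which matters because the counts are read back every iteration.
        if (cudaHostAlloc((void**)&ptr, n * sizeof(float), cudaHostAllocDefault) != cudaSuccess)
            return NULL;
        if (src) memcpy(ptr, src, n * sizeof(float));
        return ptr;
    }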

View File

@@ -6,6 +6,88 @@
#include "dropout_layer.h"
#include "dark_cuda.h"
#include "utils.h"
#include "blas.h"
__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int block_size, float *drop_blocks_scale, float *output)
{
const int threads = BLOCK;
// shared slot through which the threads agree on the dropped block's origin
__shared__ int index_block;
const int id = threadIdx.x;
// one CUDA block per (batch item, filter) channel
const int f = blockIdx.x % filters;
const int b = blockIdx.x / filters;
if(id == 0) index_block = -1;
__syncthreads();
int i;
for (i = 0; i < spatial; i += threads) {
if (i + id < spatial) {
// each thread inspects its own random value within this channel's plane
int index = b*filters*spatial + f*spatial + i + id;
if (rand[index] < prob) {
// benign race: whichever thread writes last picks the block origin
index_block = i + id;
}
}
}
__syncthreads();
if (index_block == -1) return;
int b_x = index_block % w;
int b_y = index_block / w;
// clamp the origin so the whole block stays inside the w x h plane
b_x = max(0, min(b_x, w - block_size));
b_y = max(0, min(b_y, h - block_size));
int block_square_size = block_size * block_size;
// the threads split the block_size x block_size square between them
for (i = id; i < block_square_size; i += threads)
{
int i_x = i % block_size;
int i_y = i / block_size;
int x = b_x + i_x;
int y = b_y + i_y;
int index = b*filters*spatial + f*spatial + y*w + x;
output[index] = 0;
}
/*
if (id == 0) {
for (int x = b_x; x < (b_x + block_size); ++x)
{
for (int y = b_y; y < (b_y + block_size); ++y)
{
int index = b*filters*spatial + f*spatial + y*w + x;
output[index] = 0;
}
}
}
*/
// one thread per channel adds this dropped block to batch item b's counter
if (id == 0 && drop_blocks_scale) {
atomicAdd(&drop_blocks_scale[b], 1);
}
}
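/*
 For intuition: the kernel above is launched with one CUDA block per
 (batch item, filter) channel - blockIdx.x / filters selects the batch item,
 blockIdx.x % filters the channel - and the BLOCK threads cooperate on that
 channel's w x h plane. Below is a sequential CPU reference of the same logic,
 useful when validating the kernel. It is a hypothetical helper, not part of
 this commit; note the GPU version races on index_block, so any position with
 rand < prob may win, while this reference always picks the last match.
*/
static void dropblock_cpu_ref(const float *rand, float prob, int w, int h,
                              int filters, int block_size, float *output, int b)
{
    const int spatial = w * h;
    for (int f = 0; f < filters; ++f) {
        // find a spatial position whose random draw falls below prob
        int index_block = -1;
        for (int i = 0; i < spatial; ++i) {
            if (rand[b*filters*spatial + f*spatial + i] < prob) index_block = i;
        }
        if (index_block == -1) continue;
        // clamp the block origin so the square stays inside the plane
        int b_x = index_block % w;
        int b_y = index_block / w;
        if (b_x > w - block_size) b_x = w - block_size;
        if (b_y > h - block_size) b_y = h - block_size;
        if (b_x < 0) b_x = 0;
        if (b_y < 0) b_y = 0;
        // zero the block_size x block_size square
        for (int y = b_y; y < b_y + block_size; ++y)
            for (int x = b_x; x < b_x + block_size; ++x)
                output[b*filters*spatial + f*spatial + y*w + x] = 0;
    }
}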
__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
{
const int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index >= size) return;
// batch item this element belongs to
const int b = index / outputs;
output[index] *= drop_blocks_scale[b];
}
__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
{
@@ -27,8 +109,8 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
// We gradually increase the block size and the probability of dropout over the course of training
float multiplier = 1.0;
if(iteration_num < (state.net.max_batches))
multiplier = (iteration_num / (float)(state.net.max_batches));
// dropblock
if (l.dropblock) {
@@ -47,6 +129,31 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
block_width = max_val_cmp(1, block_width);
block_height = max_val_cmp(1, block_height);
int size = l.inputs*l.batch;
cuda_random(l.rand_gpu, size);
fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
int num_blocks = l.batch * l.c;
dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, cur_prob, l.w, l.h, l.w*l.h, l.c, block_width, l.drop_blocks_scale_gpu, state.input);
CHECK_CUDA(cudaPeekAtLastError());
cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
for (int b = 0; b < l.batch; ++b) {
// fraction of this batch item's activations that the kernel zeroed out
const float prob = l.drop_blocks_scale[b] * block_width * block_width / (float)l.outputs;
// inverted-dropout rescaling keeps the expected activation magnitude unchanged
const float scale = 1.0f / (1.0f - prob);
l.drop_blocks_scale[b] = scale;
//printf(" , %f , ", scale);
}
cuda_push_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
CHECK_CUDA(cudaPeekAtLastError());
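/*
 Worked example of the rescaling above, with hypothetical sizes (not values
 from this commit): a 19x19x128 layer gives l.outputs = 19*19*128 = 46208.
 If the kernel counted 96 dropped blocks of block_width = 7 for batch item b:
     prob  = 96 * 7 * 7 / 46208.0f   ~= 0.1018
     scale = 1.0f / (1.0f - 0.1018)  ~= 1.113
 so scale_dropblock_kernel multiplies every element of batch item b by ~1.113,
 keeping the expected activation magnitude at the no-dropout level.
*/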
/*
const float part_occupied_by_block = block_width * block_height / ((float)l.w * l.h);
const float prob_place_block = cur_prob / (part_occupied_by_block * max_blocks_per_channel);
@@ -93,7 +200,7 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
drop_block_kernel << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.input, size, l.rand_gpu, l.scale);
CHECK_CUDA(cudaPeekAtLastError());
*/
}
// dropout
else {
@@ -122,8 +229,32 @@ void backward_dropout_layer_gpu(dropout_layer l, network_state state)
// dropblock
if (l.dropblock) {
// recompute the same block-size schedule that the forward pass used
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
float multiplier = 1.0;
if (iteration_num < (state.net.max_batches))
multiplier = (iteration_num / (float)(state.net.max_batches));
int block_width = l.dropblock_size_abs * multiplier;
int block_height = l.dropblock_size_abs * multiplier;
if (l.dropblock_size_rel) {
block_width = l.dropblock_size_rel * l.w * multiplier;
block_height = l.dropblock_size_rel * l.h * multiplier;
}
block_width = max_val_cmp(1, block_width);
block_height = max_val_cmp(1, block_height);
int num_blocks = l.batch * l.c;
// l.rand_gpu still holds the forward pass's random values, so the same blocks
// are zeroed in the gradient; passing NULL skips the block counting
dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, l.probability, l.w, l.h, l.w*l.h, l.c, block_width, NULL, state.delta);
CHECK_CUDA(cudaPeekAtLastError());
num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.delta, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
CHECK_CUDA(cudaPeekAtLastError());
//drop_block_kernel << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.delta, size, l.rand_gpu, l.scale);
//CHECK_CUDA(cudaPeekAtLastError());
}
// dropout
else {

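Because the backward pass re-derives the mask from the unchanged l.rand_gpu, a quick sanity check is to confirm that forward and backward zero the same cells. A hypothetical test sketch (not part of this commit) using darknet's cuda_pull_array; fill the buffers with nonzero test data first, since genuinely zero activations would show up as false mismatches:

    // Hypothetical mask-consistency check; assumes darknet.h and dark_cuda.h.
    void check_dropblock_masks(dropout_layer l, float *input_gpu, float *delta_gpu)
    {
        int n = l.outputs * l.batch;
        float *in = (float*)calloc(n, sizeof(float));
        float *del = (float*)calloc(n, sizeof(float));
        cuda_pull_array(input_gpu, in, n);   // copy forward output to host
        cuda_pull_array(delta_gpu, del, n);  // copy gradient to host
        int mismatches = 0;
        for (int i = 0; i < n; ++i) {
            // a cell dropped in the forward output must also be zero in the gradient
            if ((in[i] == 0) != (del[i] == 0)) ++mismatches;
        }
        printf("dropblock mask mismatches: %d\n", mismatches);
        free(in); free(del);
    }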
View File

@@ -55,7 +55,9 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
if (l.type == DROPOUT) {
if (l.rand) free(l.rand);
#ifdef GPU
if (l.rand_gpu) cuda_free(l.rand_gpu);
if (l.drop_blocks_scale) cuda_free_host(l.drop_blocks_scale);
if (l.drop_blocks_scale_gpu) cuda_free(l.drop_blocks_scale_gpu);
#endif
return;
}
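Note the asymmetric frees: the pinned host buffer goes through cuda_free_host while the device buffer goes through cuda_free. A minimal sketch of the pairing, assuming cuda_free_host wraps cudaFreeHost and cuda_free wraps cudaFree (neither body is shown in this diff):

    #include <cuda_runtime.h>

    // Sketch of the paired lifecycle of the two per-batch scale buffers.
    void drop_blocks_scale_lifecycle(int batch)
    {
        float *host = NULL, *dev = NULL;
        cudaHostAlloc((void**)&host, batch * sizeof(float), cudaHostAllocDefault); // cuda_make_array_pinned analogue
        cudaMalloc((void**)&dev, batch * sizeof(float));                           // cuda_make_array analogue
        // ... kernel fills dev, counts are pulled to host, scales pushed back ...
        cudaFreeHost(host); // cuda_free_host analogue; plain free() must not be used here
        cudaFree(dev);      // cuda_free analogue
    }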