From 6fb817f68b471da21c04f8bd5ffef0d3c4d67c30 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sat, 22 Feb 2020 01:04:55 +0300
Subject: [PATCH] Accelerated DropBlock on GPU

---
 include/darknet.h            |   2 +
 src/dropout_layer.c          |   6 +-
 src/dropout_layer_kernels.cu | 139 ++++++++++++++++++++++++++++++++++-
 src/layer.c                  |   4 +-
 4 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index ac50e49e..a13168da 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -586,6 +586,8 @@ struct layer {
     float * loss_gpu;
     float * delta_gpu;
     float * rand_gpu;
+    float * drop_blocks_scale;
+    float * drop_blocks_scale_gpu;
     float * squared_gpu;
     float * norms_gpu;

diff --git a/src/dropout_layer.c b/src/dropout_layer.c
index 395e6a35..3778a7b9 100644
--- a/src/dropout_layer.c
+++ b/src/dropout_layer.c
@@ -29,11 +29,13 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability, int d
     l.scale = 1./(1.0 - probability);
     l.forward = forward_dropout_layer;
     l.backward = backward_dropout_layer;
-    #ifdef GPU
+#ifdef GPU
     l.forward_gpu = forward_dropout_layer_gpu;
     l.backward_gpu = backward_dropout_layer_gpu;
     l.rand_gpu = cuda_make_array(l.rand, inputs*batch);
-    #endif
+    l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch);
+    l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch);
+#endif
     if (l.dropblock) {
         if(l.dropblock_size_abs) fprintf(stderr, "dropblock p = %.2f l.dropblock_size_abs = %d %4d -> %4d\n", probability, l.dropblock_size_abs, inputs, inputs);
         else fprintf(stderr, "dropblock p = %.2f l.dropblock_size_rel = %.2f %4d -> %4d\n", probability, l.dropblock_size_rel, inputs, inputs);
diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu
index a01bb9c6..1a753e3e 100644
--- a/src/dropout_layer_kernels.cu
+++ b/src/dropout_layer_kernels.cu
@@ -6,6 +6,88 @@
 #include "dropout_layer.h"
 #include "dark_cuda.h"
 #include "utils.h"
+#include "blas.h"
+
+
+__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int block_size, float *drop_blocks_scale, float *output)
+{
+    const int threads = BLOCK;
+    __shared__ int index_block;
+
+    const int id = threadIdx.x;
+    const int f = blockIdx.x % filters;
+    const int b = blockIdx.x / filters;
+
+    if(id == 0) index_block = -1;
+    __syncthreads();
+
+    int i;
+    for (i = 0; i < spatial; i += threads) {
+        int index = b*spatial*f + f*spatial + i + id;
+
+        if (i + id < spatial) {
+            if (rand[id] < prob) {
+                index_block = id;
+            }
+        }
+    }
+
+    __syncthreads();
+    if (index_block == -1) return;
+
+
+    int b_x = index_block % w;
+    int b_y = index_block / w;
+
+    b_x = max(0, min(b_x, w - block_size));
+    b_y = max(0, min(b_y, h - block_size));
+
+    int block_square_size = block_size * block_size;
+
+    for (i = 0; i < block_square_size; i += threads)
+    {
+        int i_x = i % w;
+        int i_y = i / w;
+
+        int x = b_x + i_x;
+        int y = b_y + i_y;
+
+        int index = b*spatial*f + f*spatial + y*w + x;
+
+        output[index] = 0;
+    }
+
+    /*
+    if (id == 0) {
+
+        for (int x = b_x; x < (b_x + block_size); ++x)
+        {
+            for (int y = b_y; y < (b_y + block_size); ++y)
+            {
+                int index = b*spatial*f + f*spatial + y*w + x;
+
+                output[index] = 0;
+            }
+        }
+    }
+    */
+
+    if (id == 0 && drop_blocks_scale) {
+        atomicAdd(&drop_blocks_scale[b], 1);
+    }
+
+}
+
+
+__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    const int b = size / outputs;
+    output[index] *= drop_blocks_scale[b];
+}
+

 __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
 {
@@ -27,8 +109,8 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)

     // We gradually increase the block size and the probability of dropout - during the first half of the training
     float multiplier = 1.0;
-    if(iteration_num < (state.net.max_batches / 2))
-        multiplier = (iteration_num / (float)(state.net.max_batches / 2));
+    if(iteration_num < (state.net.max_batches))
+        multiplier = (iteration_num / (float)(state.net.max_batches));

     // dropblock
     if (l.dropblock) {
@@ -47,6 +129,31 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
         block_width = max_val_cmp(1, block_width);
         block_height = max_val_cmp(1, block_height);

+        int size = l.inputs*l.batch;
+        cuda_random(l.rand_gpu, size);
+
+        fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
+
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, cur_prob, l.w, l.h, l.w*l.h, l.c, block_width, l.drop_blocks_scale_gpu, state.input);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+        for (int b = 0; b < l.batch; ++b) {
+            const float prob = l.drop_blocks_scale[b] * block_width * block_width / (float)l.outputs;
+            const float scale = 1.0f / (1.0f - prob);
+            l.drop_blocks_scale[b] = scale;
+            //printf(" , %f , ", scale);
+        }
+
+        cuda_push_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        /*
         const float part_occupied_by_block = block_width * block_height / ((float)l.w * l.h);
         const float prob_place_block = cur_prob / (part_occupied_by_block * max_blocks_per_channel);

@@ -93,7 +200,7 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)

         drop_block_kernel << > > (state.input, size, l.rand_gpu, l.scale);
         CHECK_CUDA(cudaPeekAtLastError());
-
+        */
     }
     // dropout
     else {
@@ -122,8 +229,32 @@ void backward_dropout_layer_gpu(dropout_layer l, network_state state)

     // dropblock
     if (l.dropblock) {
-        drop_block_kernel << > > (state.delta, size, l.rand_gpu, l.scale);
+        int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+        float multiplier = 1.0;
+        if (iteration_num < (state.net.max_batches))
+            multiplier = (iteration_num / (float)(state.net.max_batches));
+
+        int block_width = l.dropblock_size_abs * multiplier;
+        int block_height = l.dropblock_size_abs * multiplier;
+
+        if (l.dropblock_size_rel) {
+            block_width = l.dropblock_size_rel * l.w * multiplier;
+            block_height = l.dropblock_size_rel * l.h * multiplier;
+        }
+
+        block_width = max_val_cmp(1, block_width);
+        block_height = max_val_cmp(1, block_height);
+
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, l.probability, l.w, l.h, l.w*l.h, l.c, block_width, NULL, state.delta);
         CHECK_CUDA(cudaPeekAtLastError());
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.delta, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        //drop_block_kernel << > > (state.delta, size, l.rand_gpu, l.scale);
+        //CHECK_CUDA(cudaPeekAtLastError());
     }
     // dropout
     else {
diff --git a/src/layer.c b/src/layer.c
index d00e2213..4bc84441 100644
--- a/src/layer.c
+++ b/src/layer.c
@@ -55,7 +55,9 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
     if (l.type == DROPOUT) {
         if (l.rand) free(l.rand);
 #ifdef GPU
-        if (l.rand_gpu) cuda_free(l.rand_gpu);
+        if (l.rand_gpu) cuda_free(l.rand_gpu);
+        if (l.drop_blocks_scale) cuda_free_host(l.drop_blocks_scale);
+        if (l.drop_blocks_scale_gpu) cuda_free(l.drop_blocks_scale_gpu);
 #endif
         return;
     }
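
The sketch below is a rough CPU-side illustration of what the new GPU path computes; it is not part of the patch or of darknet. It assumes NCHW layout (the same b*outputs + f*spatial + y*w + x indexing the kernels use) and calls rand() where the patch uses cuda_random(); the function name dropblock_forward_cpu and the exact way the block origin is sampled are illustrative choices. The part worth reading is the rescaling step: each image's dropped fraction is estimated as dropped_blocks * block_size^2 / outputs, and the surviving activations are multiplied by 1 / (1 - dropped_fraction), which is what the host loop in forward_dropout_layer_gpu together with scale_dropblock_kernel implements.

/* Illustrative CPU sketch (assumptions above): for each (image, channel),
 * with probability `prob` zero one block_size x block_size block at a random
 * position, then rescale the whole image by 1/(1 - dropped_fraction). */
#include <stdlib.h>

static void dropblock_forward_cpu(float *x, int batch, int c, int h, int w,
                                  int block_size, float prob)
{
    const int spatial = h * w;
    const int outputs = c * spatial;      /* plays the role of l.outputs */

    for (int b = 0; b < batch; ++b) {
        int dropped_blocks = 0;           /* plays the role of l.drop_blocks_scale[b] */

        for (int f = 0; f < c; ++f) {
            /* decide whether this (image, channel) pair gets a dropped block */
            if ((float)rand() / (float)RAND_MAX >= prob) continue;

            /* random block origin, clamped so the block stays inside the map */
            int b_x = rand() % w;
            int b_y = rand() % h;
            if (b_x > w - block_size) b_x = w - block_size;
            if (b_y > h - block_size) b_y = h - block_size;
            if (b_x < 0) b_x = 0;
            if (b_y < 0) b_y = 0;

            /* zero the block */
            for (int y = b_y; y < b_y + block_size && y < h; ++y)
                for (int xx = b_x; xx < b_x + block_size && xx < w; ++xx)
                    x[b*outputs + f*spatial + y*w + xx] = 0;

            ++dropped_blocks;
        }

        /* same per-image rescaling as the host loop in forward_dropout_layer_gpu:
         * dropped fraction ~= dropped_blocks * block_size^2 / outputs */
        const float dropped = dropped_blocks * block_size * block_size / (float)outputs;
        const float scale = 1.0f / (1.0f - dropped);
        for (int i = 0; i < outputs; ++i) x[b*outputs + i] *= scale;
    }
}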