Accelerated DropBlock on GPU

AlexeyAB 2020-02-22 01:04:55 +03:00
parent 0ee9c8668f
commit 6fb817f68b
4 changed files with 144 additions and 7 deletions

View File

@@ -586,6 +586,8 @@ struct layer {
float * loss_gpu;
float * delta_gpu;
float * rand_gpu;
float * drop_blocks_scale;
float * drop_blocks_scale_gpu;
float * squared_gpu;
float * norms_gpu;

View File

@@ -29,11 +29,13 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability, int d
l.scale = 1./(1.0 - probability);
l.forward = forward_dropout_layer;
l.backward = backward_dropout_layer;
#ifdef GPU
l.forward_gpu = forward_dropout_layer_gpu;
l.backward_gpu = backward_dropout_layer_gpu;
l.rand_gpu = cuda_make_array(l.rand, inputs*batch);
l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch);
l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch);
#endif
if (l.dropblock) {
if(l.dropblock_size_abs) fprintf(stderr, "dropblock p = %.2f l.dropblock_size_abs = %d %4d -> %4d\n", probability, l.dropblock_size_abs, inputs, inputs);
else fprintf(stderr, "dropblock p = %.2f l.dropblock_size_rel = %.2f %4d -> %4d\n", probability, l.dropblock_size_rel, inputs, inputs);
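The scale buffer is allocated twice: drop_blocks_scale as page-locked (pinned) host memory, so the per-batch block counts can be pulled back from the GPU cheaply on every forward pass, and drop_blocks_scale_gpu on the device. The helper's body is not part of this diff; a minimal sketch of what a pinned allocation typically looks like, assuming cuda_make_array_pinned wraps cudaHostAlloc:

    #include <cuda_runtime.h>
    #include <string.h>

    // Sketch of a pinned host allocation helper (assumption: the real
    // cuda_make_array_pinned in dark_cuda.c does something equivalent).
    float *make_pinned_array_sketch(float *src, size_t n)
    {
        float *ptr = NULL;
        // Page-locked memory allows fast, truly asynchronous device<->host
        // copies, which matters because the counts are read back every iteration.
        if (cudaHostAlloc((void**)&ptr, n * sizeof(float), cudaHostAllocDefault) != cudaSuccess)
            return NULL;
        if (src) memcpy(ptr, src, n * sizeof(float));
        return ptr;
    }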

View File

@@ -6,6 +6,88 @@
#include "dropout_layer.h"
#include "dark_cuda.h"
#include "utils.h"
#include "blas.h"
__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int block_size, float *drop_blocks_scale, float *output)
{
const int threads = BLOCK;
// shared slot through which the threads agree on the dropped block's origin
__shared__ int index_block;
const int id = threadIdx.x;
// one CUDA block per (batch item, filter) channel
const int f = blockIdx.x % filters;
const int b = blockIdx.x / filters;
if(id == 0) index_block = -1;
__syncthreads();
int i;
for (i = 0; i < spatial; i += threads) {
if (i + id < spatial) {
// each thread inspects its own random value within this channel's plane
int index = b*filters*spatial + f*spatial + i + id;
if (rand[index] < prob) {
// benign race: whichever thread writes last picks the block origin
index_block = i + id;
}
}
}
__syncthreads();
if (index_block == -1) return;
int b_x = index_block % w;
int b_y = index_block / w;
// clamp the origin so the whole block stays inside the w x h plane
b_x = max(0, min(b_x, w - block_size));
b_y = max(0, min(b_y, h - block_size));
int block_square_size = block_size * block_size;
// the threads split the block_size x block_size square between them
for (i = id; i < block_square_size; i += threads)
{
int i_x = i % block_size;
int i_y = i / block_size;
int x = b_x + i_x;
int y = b_y + i_y;
int index = b*filters*spatial + f*spatial + y*w + x;
output[index] = 0;
}
/*
if (id == 0) {
for (int x = b_x; x < (b_x + block_size); ++x)
{
for (int y = b_y; y < (b_y + block_size); ++y)
{
int index = b*filters*spatial + f*spatial + y*w + x;
output[index] = 0;
}
}
}
*/
// one thread per channel adds this dropped block to batch item b's counter
if (id == 0 && drop_blocks_scale) {
atomicAdd(&drop_blocks_scale[b], 1);
}
}
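/*
 For intuition: the kernel above is launched with one CUDA block per
 (batch item, filter) channel - blockIdx.x / filters selects the batch item,
 blockIdx.x % filters the channel - and the BLOCK threads cooperate on that
 channel's w x h plane. Below is a sequential CPU reference of the same logic,
 useful when validating the kernel. It is a hypothetical helper, not part of
 this commit; note the GPU version races on index_block, so any position with
 rand < prob may win, while this reference always picks the last match.
*/
static void dropblock_cpu_ref(const float *rand, float prob, int w, int h,
                              int filters, int block_size, float *output, int b)
{
    const int spatial = w * h;
    for (int f = 0; f < filters; ++f) {
        // find a spatial position whose random draw falls below prob
        int index_block = -1;
        for (int i = 0; i < spatial; ++i) {
            if (rand[b*filters*spatial + f*spatial + i] < prob) index_block = i;
        }
        if (index_block == -1) continue;
        // clamp the block origin so the square stays inside the plane
        int b_x = index_block % w;
        int b_y = index_block / w;
        if (b_x > w - block_size) b_x = w - block_size;
        if (b_y > h - block_size) b_y = h - block_size;
        if (b_x < 0) b_x = 0;
        if (b_y < 0) b_y = 0;
        // zero the block_size x block_size square
        for (int y = b_y; y < b_y + block_size; ++y)
            for (int x = b_x; x < b_x + block_size; ++x)
                output[b*filters*spatial + f*spatial + y*w + x] = 0;
    }
}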
__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
{
const int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index >= size) return;
// batch item this element belongs to
const int b = index / outputs;
output[index] *= drop_blocks_scale[b];
}
__global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
{
@@ -27,8 +109,8 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
// We gradually increase the block size and the probability of dropout over the course of training
float multiplier = 1.0;
if(iteration_num < (state.net.max_batches))
multiplier = (iteration_num / (float)(state.net.max_batches));
// dropblock
if (l.dropblock) {
@@ -47,6 +129,31 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
block_width = max_val_cmp(1, block_width);
block_height = max_val_cmp(1, block_height);
int size = l.inputs*l.batch;
cuda_random(l.rand_gpu, size);
fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
int num_blocks = l.batch * l.c;
dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, cur_prob, l.w, l.h, l.w*l.h, l.c, block_width, l.drop_blocks_scale_gpu, state.input);
CHECK_CUDA(cudaPeekAtLastError());
cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
for (int b = 0; b < l.batch; ++b) {
// fraction of this batch item's activations that the kernel zeroed out
const float prob = l.drop_blocks_scale[b] * block_width * block_width / (float)l.outputs;
// inverted-dropout rescaling keeps the expected activation magnitude unchanged
const float scale = 1.0f / (1.0f - prob);
l.drop_blocks_scale[b] = scale;
//printf(" , %f , ", scale);
}
cuda_push_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
CHECK_CUDA(cudaPeekAtLastError());
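/*
 Worked example of the rescaling above, with hypothetical sizes (not values
 from this commit): a 19x19x128 layer gives l.outputs = 19*19*128 = 46208.
 If the kernel counted 96 dropped blocks of block_width = 7 for batch item b:
     prob  = 96 * 7 * 7 / 46208.0f   ~= 0.1018
     scale = 1.0f / (1.0f - 0.1018)  ~= 1.113
 so scale_dropblock_kernel multiplies every element of batch item b by ~1.113,
 keeping the expected activation magnitude at the no-dropout level.
*/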
/*
const float part_occupied_by_block = block_width * block_height / ((float)l.w * l.h);
const float prob_place_block = cur_prob / (part_occupied_by_block * max_blocks_per_channel);
@@ -93,7 +200,7 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
drop_block_kernel << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.input, size, l.rand_gpu, l.scale);
CHECK_CUDA(cudaPeekAtLastError());
*/
}
// dropout
else {
@@ -122,8 +229,32 @@ void backward_dropout_layer_gpu(dropout_layer l, network_state state)
// dropblock
if (l.dropblock) {
// recompute the same block-size schedule that the forward pass used
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
float multiplier = 1.0;
if (iteration_num < (state.net.max_batches))
multiplier = (iteration_num / (float)(state.net.max_batches));
int block_width = l.dropblock_size_abs * multiplier;
int block_height = l.dropblock_size_abs * multiplier;
if (l.dropblock_size_rel) {
block_width = l.dropblock_size_rel * l.w * multiplier;
block_height = l.dropblock_size_rel * l.h * multiplier;
}
block_width = max_val_cmp(1, block_width);
block_height = max_val_cmp(1, block_height);
int num_blocks = l.batch * l.c;
// l.rand_gpu still holds the forward pass's random values, so the same blocks
// are zeroed in the gradient; passing NULL skips the block counting
dropblock_fast_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (l.rand_gpu, l.probability, l.w, l.h, l.w*l.h, l.c, block_width, NULL, state.delta);
CHECK_CUDA(cudaPeekAtLastError());
num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
scale_dropblock_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (state.delta, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
CHECK_CUDA(cudaPeekAtLastError());
//drop_block_kernel << <cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >> > (state.delta, size, l.rand_gpu, l.scale);
//CHECK_CUDA(cudaPeekAtLastError());
}
// dropout
else {

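Because the backward pass re-derives the mask from the unchanged l.rand_gpu, a quick sanity check is to confirm that forward and backward zero the same cells. A hypothetical test sketch (not part of this commit) using darknet's cuda_pull_array; fill the buffers with nonzero test data first, since genuinely zero activations would show up as false mismatches:

    // Hypothetical mask-consistency check; assumes darknet.h and dark_cuda.h.
    void check_dropblock_masks(dropout_layer l, float *input_gpu, float *delta_gpu)
    {
        int n = l.outputs * l.batch;
        float *in = (float*)calloc(n, sizeof(float));
        float *del = (float*)calloc(n, sizeof(float));
        cuda_pull_array(input_gpu, in, n);   // copy forward output to host
        cuda_pull_array(delta_gpu, del, n);  // copy gradient to host
        int mismatches = 0;
        for (int i = 0; i < n; ++i) {
            // a cell dropped in the forward output must also be zero in the gradient
            if ((in[i] == 0) != (del[i] == 0)) ++mismatches;
        }
        printf("dropblock mask mismatches: %d\n", mismatches);
        free(in); free(del);
    }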
View File

@@ -55,7 +55,9 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
if (l.type == DROPOUT) {
if (l.rand) free(l.rand);
#ifdef GPU
if (l.rand_gpu) cuda_free(l.rand_gpu);
if (l.drop_blocks_scale) cuda_free_host(l.drop_blocks_scale);
if (l.drop_blocks_scale_gpu) cuda_free(l.drop_blocks_scale_gpu);
#endif
return;
}
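Note the asymmetric frees: the pinned host buffer goes through cuda_free_host while the device buffer goes through cuda_free. A minimal sketch of the pairing, assuming cuda_free_host wraps cudaFreeHost and cuda_free wraps cudaFree (neither body is shown in this diff):

    #include <cuda_runtime.h>

    // Sketch of the paired lifecycle of the two per-batch scale buffers.
    void drop_blocks_scale_lifecycle(int batch)
    {
        float *host = NULL, *dev = NULL;
        cudaHostAlloc((void**)&host, batch * sizeof(float), cudaHostAllocDefault); // cuda_make_array_pinned analogue
        cudaMalloc((void**)&dev, batch * sizeof(float));                           // cuda_make_array analogue
        // ... kernel fills dev, counts are pulled to host, scales pushed back ...
        cudaFreeHost(host); // cuda_free_host analogue; plain free() must not be used here
        cudaFree(dev);      // cuda_free analogue
    }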