Accelerated DropBlock on GPU
Mirror of https://github.com/AlexeyAB/darknet.git
Commit 6fb817f68b (parent 0ee9c8668f)
@@ -586,6 +586,8 @@ struct layer {
     float * loss_gpu;
     float * delta_gpu;
     float * rand_gpu;
+    float * drop_blocks_scale;
+    float * drop_blocks_scale_gpu;
     float * squared_gpu;
     float * norms_gpu;

@@ -29,11 +29,13 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability, int d
     l.scale = 1./(1.0 - probability);
     l.forward = forward_dropout_layer;
     l.backward = backward_dropout_layer;
 #ifdef GPU
     l.forward_gpu = forward_dropout_layer_gpu;
     l.backward_gpu = backward_dropout_layer_gpu;
     l.rand_gpu = cuda_make_array(l.rand, inputs*batch);
+    l.drop_blocks_scale = cuda_make_array_pinned(l.rand, l.batch);
+    l.drop_blocks_scale_gpu = cuda_make_array(l.rand, l.batch);
 #endif
     if (l.dropblock) {
         if(l.dropblock_size_abs) fprintf(stderr, "dropblock p = %.2f l.dropblock_size_abs = %d %4d -> %4d\n", probability, l.dropblock_size_abs, inputs, inputs);
         else fprintf(stderr, "dropblock p = %.2f l.dropblock_size_rel = %.2f %4d -> %4d\n", probability, l.dropblock_size_rel, inputs, inputs);

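cuda_make_array_pinned presumably wraps a page-locked host allocation (cudaHostAlloc), so the per-image scales can be copied between host and device each iteration without a pageable staging buffer. A minimal sketch of that pattern in plain CUDA (scales_host and scales_dev are illustrative names, not darknet's):

#include <cuda_runtime.h>
#include <stdio.h>

int main()
{
    const int batch = 4;
    float *scales_host, *scales_dev;

    /* Page-locked (pinned) host buffer: small per-iteration device-to-host
       copies can DMA directly instead of bouncing through pageable memory. */
    cudaHostAlloc((void**)&scales_host, batch * sizeof(float), cudaHostAllocDefault);
    cudaMalloc((void**)&scales_dev, batch * sizeof(float));
    cudaMemset(scales_dev, 0, batch * sizeof(float));

    /* ... kernels fill scales_dev ... */
    cudaMemcpy(scales_host, scales_dev, batch * sizeof(float), cudaMemcpyDeviceToHost);
    for (int b = 0; b < batch; ++b) printf("scale[%d] = %f\n", b, scales_host[b]);

    /* pinned memory is released with cudaFreeHost, which is what
       cuda_free_host does for l.drop_blocks_scale in free_layer_custom below */
    cudaFreeHost(scales_host);
    cudaFree(scales_dev);
    return 0;
}
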
@@ -6,6 +6,88 @@
 #include "dropout_layer.h"
 #include "dark_cuda.h"
 #include "utils.h"
 #include "blas.h"

+__global__ void dropblock_fast_kernel(float *rand, float prob, int w, int h, int spatial, int filters, int block_size, float *drop_blocks_scale, float *output)
+{
+    const int threads = BLOCK;
+    __shared__ int index_block;
+
+    const int id = threadIdx.x;
+    const int f = blockIdx.x % filters;  // one CUDA block per (image, channel) pair
+    const int b = blockIdx.x / filters;
+
+    if (id == 0) index_block = -1;
+    __syncthreads();
+
+    // Each thread scans a strided slice of the channel; any position whose
+    // random value falls below prob may seed the dropped block (the race on
+    // index_block is benign: any winning position is an acceptable choice).
+    int i;
+    for (i = 0; i < spatial; i += threads) {
+        int index = (b*filters + f)*spatial + i + id;
+
+        if (i + id < spatial) {
+            if (rand[index] < prob) {
+                index_block = i + id;
+            }
+        }
+    }
+
+    __syncthreads();
+    if (index_block == -1) return;
+
+    // Clamp the block's top-left corner so the whole block stays inside the channel
+    int b_x = index_block % w;
+    int b_y = index_block / w;
+
+    b_x = max(0, min(b_x, w - block_size));
+    b_y = max(0, min(b_y, h - block_size));
+
+    int block_square_size = block_size * block_size;
+
+    // Zero the block_size x block_size region with all threads cooperating
+    for (i = id; i < block_square_size; i += threads)
+    {
+        int i_x = i % block_size;
+        int i_y = i / block_size;
+
+        int x = b_x + i_x;
+        int y = b_y + i_y;
+
+        int index = (b*filters + f)*spatial + y*w + x;
+
+        output[index] = 0;
+    }
+
+    /*
+    // single-threaded variant of the same zeroing, kept for reference
+    if (id == 0) {
+        for (int x = b_x; x < (b_x + block_size); ++x)
+        {
+            for (int y = b_y; y < (b_y + block_size); ++y)
+            {
+                int index = (b*filters + f)*spatial + y*w + x;
+                output[index] = 0;
+            }
+        }
+    }
+    */
+
+    // Count how many blocks this image dropped, for the rescaling pass
+    if (id == 0 && drop_blocks_scale) {
+        atomicAdd(&drop_blocks_scale[b], 1);
+    }
+}
+
+__global__ void scale_dropblock_kernel(float *output, int size, int outputs, float *drop_blocks_scale)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= size) return;
+
+    const int b = index / outputs;  // image this element belongs to
+    output[index] *= drop_blocks_scale[b];
+}

 __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand, float prob, float scale)
 {

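A quick sanity check of the kernel's contract, namely that at most one block_size x block_size region is zeroed per (image, channel) pair, could look like the hypothetical harness below. It assumes compilation in the same translation unit as dropblock_fast_kernel, with darknet's BLOCK (512):

/* Hypothetical standalone check, not part of the commit. Compile together with
   dropblock_fast_kernel, e.g.: nvcc -DBLOCK=512 dropblock_test.cu */
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

int main()
{
    const int w = 13, h = 13, filters = 2, batch = 1;
    const int spatial = w * h, n = batch * filters * spatial;
    const int block_size = 3;

    float *out_h  = (float*)malloc(n * sizeof(float));
    float *rand_h = (float*)malloc(n * sizeof(float));
    for (int i = 0; i < n; ++i) out_h[i] = 1.0f;
    for (int i = 0; i < n; ++i) rand_h[i] = (float)rand() / RAND_MAX;

    float *out_d, *rand_d, *count_d;
    cudaMalloc(&out_d, n * sizeof(float));
    cudaMalloc(&rand_d, n * sizeof(float));
    cudaMalloc(&count_d, batch * sizeof(float));
    cudaMemcpy(out_d, out_h, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(rand_d, rand_h, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(count_d, 0, batch * sizeof(float));

    /* one CUDA block per (image, channel) pair, as in forward_dropout_layer_gpu */
    dropblock_fast_kernel<<<batch * filters, BLOCK>>>(rand_d, 0.05f, w, h, spatial,
                                                      filters, block_size, count_d, out_d);
    cudaDeviceSynchronize();

    cudaMemcpy(out_h, out_d, n * sizeof(float), cudaMemcpyDeviceToHost);
    int zeros = 0;
    for (int i = 0; i < n; ++i) zeros += (out_h[i] == 0.0f);
    /* each channel zeroes at most one block_size x block_size region */
    printf("zeroed %d elements (max %d)\n", zeros, batch * filters * block_size * block_size);
    return 0;
}
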
@@ -27,8 +109,8 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)

     // We gradually increase the block size and the probability of dropout over the course of training
     float multiplier = 1.0;
-    if (iteration_num < (state.net.max_batches / 2))
-        multiplier = (iteration_num / (float)(state.net.max_batches / 2));
+    if (iteration_num < (state.net.max_batches))
+        multiplier = (iteration_num / (float)(state.net.max_batches));

     // dropblock
     if (l.dropblock) {

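With the ramp now spanning the whole run rather than its first half, block size and drop probability reach full strength only at the final iteration. A small illustration with hypothetical numbers:

#include <stdio.h>

int main()
{
    const int max_batches = 500500;        /* hypothetical training length */
    const int dropblock_size_abs = 7;      /* hypothetical configured block size */

    for (int it = 0; it <= max_batches; it += 100100) {
        float multiplier = (it < max_batches) ? it / (float)max_batches : 1.0f;
        int block_width = dropblock_size_abs * multiplier;
        if (block_width < 1) block_width = 1;  /* max_val_cmp(1, block_width) */
        printf("iter %6d  multiplier %.2f  block_width %d\n", it, multiplier, block_width);
    }
    return 0;
}
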
@@ -47,6 +129,31 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)
         block_width = max_val_cmp(1, block_width);
         block_height = max_val_cmp(1, block_height);

+        int size = l.inputs*l.batch;
+        cuda_random(l.rand_gpu, size);
+
+        fill_ongpu(l.batch, 0, l.drop_blocks_scale_gpu, 1);
+
+        // One CUDA block per (image, channel) pair; the kernel drops square
+        // blocks, so only block_width is passed.
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream()>>> (l.rand_gpu, cur_prob, l.w, l.h, l.w*l.h, l.c, block_width, l.drop_blocks_scale_gpu, state.input);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        cuda_pull_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+        // Turn the per-image count of dropped blocks into the scale 1/(1-p)
+        // that keeps activation magnitudes unchanged in expectation
+        for (int b = 0; b < l.batch; ++b) {
+            const float prob = l.drop_blocks_scale[b] * block_width * block_width / (float)l.outputs;
+            const float scale = 1.0f / (1.0f - prob);
+            l.drop_blocks_scale[b] = scale;
+            //printf(" , %f , ", scale);
+        }
+
+        cuda_push_array(l.drop_blocks_scale_gpu, l.drop_blocks_scale, l.batch);
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream()>>> (state.input, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        /*
         const float part_occupied_by_block = block_width * block_height / ((float)l.w * l.h);
         const float prob_place_block = cur_prob / (part_occupied_by_block * max_blocks_per_channel);

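A numeric check of the rescaling with hypothetical sizes: for a 13x13x256 output (l.outputs = 43264) and block_width = 7, an image in which 60 blocks were counted gets prob of about 0.068 and scale of about 1.073. Note the count treats dropped blocks as non-overlapping, so the correction is approximate:

#include <stdio.h>

int main()
{
    const int outputs = 13 * 13 * 256;    /* hypothetical l.outputs */
    const int block_width = 7;
    const float dropped_blocks = 60.0f;   /* value pulled back in drop_blocks_scale[b] */

    /* fraction of this image's activations zeroed, assuming blocks don't overlap */
    const float prob = dropped_blocks * block_width * block_width / (float)outputs;
    const float scale = 1.0f / (1.0f - prob);
    printf("prob = %.4f, scale = %.4f\n", prob, scale);  /* prob = 0.0680, scale = 1.0729 */
    return 0;
}
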
@@ -93,7 +200,7 @@ void forward_dropout_layer_gpu(dropout_layer l, network_state state)

         drop_block_kernel <<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream()>>> (state.input, size, l.rand_gpu, l.scale);
         CHECK_CUDA(cudaPeekAtLastError());
+        */
     }
     // dropout
     else {

@@ -122,8 +229,32 @@ void backward_dropout_layer_gpu(dropout_layer l, network_state state)

     // dropblock
     if (l.dropblock) {
-        drop_block_kernel <<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream()>>> (state.delta, size, l.rand_gpu, l.scale);
+        // Recompute the same block geometry as the forward pass; l.rand_gpu
+        // still holds the random values drawn during forward.
+        int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+        float multiplier = 1.0;
+        if (iteration_num < (state.net.max_batches))
+            multiplier = (iteration_num / (float)(state.net.max_batches));
+
+        int block_width = l.dropblock_size_abs * multiplier;
+        int block_height = l.dropblock_size_abs * multiplier;
+
+        if (l.dropblock_size_rel) {
+            block_width = l.dropblock_size_rel * l.w * multiplier;
+            block_height = l.dropblock_size_rel * l.h * multiplier;
+        }
+
+        block_width = max_val_cmp(1, block_width);
+        block_height = max_val_cmp(1, block_height);
+
+        int num_blocks = l.batch * l.c;
+        dropblock_fast_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream()>>> (l.rand_gpu, l.probability, l.w, l.h, l.w*l.h, l.c, block_width, NULL, state.delta);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        num_blocks = get_number_of_blocks(l.outputs * l.batch, BLOCK);
+        scale_dropblock_kernel <<<num_blocks, BLOCK, 0, get_cuda_stream()>>> (state.delta, l.outputs * l.batch, l.outputs, l.drop_blocks_scale_gpu);
+        CHECK_CUDA(cudaPeekAtLastError());
+
+        //drop_block_kernel <<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream()>>> (state.delta, size, l.rand_gpu, l.scale);
+        //CHECK_CUDA(cudaPeekAtLastError());
     }
     // dropout
     else {

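Because l.rand_gpu is not re-randomized between the forward and backward calls, the backward kernel re-derives its block positions from the same random values (though it passes l.probability where the forward pass used cur_prob), and the per-image scales saved during forward are reapplied to the gradients. A CPU sketch of the intended per-image semantics, illustrative only and assuming mask and scale match across passes:

#include <stddef.h>

/* Reference semantics for one image: zero the same positions in both passes,
   then multiply by the same saved scale. mask[i] is 0 inside dropped blocks,
   1 elsewhere; scale is the 1/(1-p) value computed in the forward pass. */
void dropblock_reference(const float *mask, float scale,
                         float *output, float *delta, size_t n)
{
    for (size_t i = 0; i < n; ++i) {
        output[i] *= mask[i] * scale;  /* forward:  x  <- x  * m * s */
        delta[i]  *= mask[i] * scale;  /* backward: dx <- dx * m * s */
    }
}
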
@@ -55,7 +55,9 @@ void free_layer_custom(layer l, int keep_cudnn_desc)
     if (l.type == DROPOUT) {
        if (l.rand) free(l.rand);
 #ifdef GPU
        if (l.rand_gpu) cuda_free(l.rand_gpu);
+       if (l.drop_blocks_scale) cuda_free_host(l.drop_blocks_scale);
+       if (l.drop_blocks_scale_gpu) cuda_free(l.drop_blocks_scale_gpu);
 #endif
        return;
     }