diff --git a/include/darknet.h b/include/darknet.h
index d65dd76d..ef085960 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -254,6 +254,7 @@ struct layer {
     int sway;
     int rotate;
    int stretch;
+    int stretch_sway;
     float angle;
     float jitter;
     float saturation;
diff --git a/src/blas.h b/src/blas.h
index 72c4ae9e..ebe7fb0e 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -133,6 +133,7 @@ void sam_gpu(float *in_w_h_c, int size, int channel_size, float *scales_c, float
 void smooth_rotate_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse);
 void stretch_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, float scale, int reverse);
 void sway_and_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse);
+void stretch_sway_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse);
 void rotate_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int reverse);
 void reduce_and_expand_array_gpu(const float *src_gpu, float *dst_gpu, int size, int groups);
 void expand_array_gpu(const float *src_gpu, float *dst_gpu, int size, int groups);
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index cdf8a3b3..4e7eab2e 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -1402,6 +1402,7 @@ __global__ void stretch_weights_kernel(const float *src_weight_gpu, float *weig
         //const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
         for (int y = 0; y < kernel_size; ++y) {
             for (int x = 0; x < kernel_size; ++x) {
+                //if (scale < 1) weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
                 weight_deform_gpu[x + y*kernel_size + i] /= scale;// *= coef;
             }
         }
@@ -1422,11 +1423,6 @@ extern "C" void stretch_weights_gpu(const float *src_weight_gpu, float *weight_d
 
 
 
-
-
-
-
-
 __global__ void sway_and_flip_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, int angle, int reverse)
 {
     const int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -1616,6 +1612,173 @@ extern "C" void rotate_weights_gpu(const float *src_weight_de
 
 
 
+__global__ void stretch_sway_flip_weights_kernel(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int kernel_size, float angle, int reverse)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    const int kernel_area = kernel_size * kernel_size;
+    const int i = index * kernel_area;
+
+    const int stage_step = (nweights / kernel_area) / 8;   // 8 stages
+    const int stage_id = index / stage_step;
+
+    // nweights = (c / groups) * n * size * size;
+    // kernel_area = size*size
+
+    if (i < nweights)
+    {
+
+        if (stage_id == 0) {
+            // simple copy
+            for (int x = 0; x < kernel_size; ++x) {
+                for (int y = 0; y < kernel_size; ++y) {
+                    weight_deform_gpu[x + y*kernel_size + i] = src_weight_gpu[x + y*kernel_size + i];
+                }
+            }
+        }
+        else if (stage_id == 1 || stage_id == 2 || stage_id == 3 || stage_id == 4)
+        {
+            float scale = 0.5;
+            if (stage_id == 1) scale = 0.65;
+            else if (stage_id == 2) scale = 0.8;
+            else if (stage_id == 3) scale = 1.2;
+            else if (stage_id == 4) scale = 1.4;
+
+            if (reverse) scale = 1 / scale;
+
+            const int x_c = kernel_size / 2;
+            const int y_c = kernel_size / 2;
+
+            float dropout_sum = 0;
+
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    // Xsource = x_c + (x_d - x_c) / scale
+                    // Ysource = y_c + (y_d - y_c) / scale
+
+                    float x_s = x_c + (x - x_c) / scale;
+                    float y_s = y_c + (y - y_c) / scale;
+
+                    int x_0 = floor(x_s);   // round down
+                    int x_1 = ceil(x_s);    // round up
+                    if (x_0 == x_1) x_1 = x_0 + 1;
+                    int y_0 = floor(y_s);
+                    int y_1 = ceil(y_s);
+                    if (y_0 == y_1) y_1 = y_0 + 1;
+
+                    float c_x_0 = x_1 - x_s;
+                    float c_x_1 = x_s - x_0;
+                    float c_y_0 = y_1 - y_s;
+                    float c_y_1 = y_s - y_0;
+
+                    float val = 0;
+                    if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
+                    else dropout_sum += c_x_0 * c_y_0;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
+                    else dropout_sum += c_x_1 * c_y_0;
+
+                    if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
+                    else dropout_sum += c_x_0 * c_y_1;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
+                    else dropout_sum += c_x_1 * c_y_1;
+
+                    weight_deform_gpu[x + y*kernel_size + i] = val;
+                }
+            }
+
+            // compensate for dropped items
+            //const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    if (scale > 1)
+                        weight_deform_gpu[x + y*kernel_size + i] *= scale;// *= coef;
+                }
+            }
+        }
+        else if (stage_id == 5 || stage_id == 6)
+        {
+            // rotate left or right
+            if (stage_id == 6) angle = -angle;
+            if (reverse) angle = -angle;
+
+            const float cos_a = cosf(angle * 3.14159265 / 180);
+            const float sin_a = sinf(angle * 3.14159265 / 180);
+            const int x_c = kernel_size / 2;
+            const int y_c = kernel_size / 2;
+
+            float dropout_sum = 0;
+
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    // Xsource = x*cos(alpha) + y*sin(alpha)
+                    // Ysource = -x*sin(alpha) + y*cos(alpha)
+
+                    float x_s = x_c + (x - x_c)*cos_a + (y - y_c)*sin_a;
+                    float y_s = y_c - (x - x_c)*sin_a + (y - y_c)*cos_a;
+
+                    int x_0 = floor(x_s);   // round down
+                    int x_1 = ceil(x_s);    // round up
+                    if (x_0 == x_1) x_1 = x_0 + 1;
+                    int y_0 = floor(y_s);
+                    int y_1 = ceil(y_s);
+                    if (y_0 == y_1) y_1 = y_0 + 1;
+
+                    float c_x_0 = x_1 - x_s;
+                    float c_x_1 = x_s - x_0;
+                    float c_y_0 = y_1 - y_s;
+                    float c_y_1 = y_s - y_0;
+
+                    float val = 0;
+                    if (x_0 >= 0 && x_0 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_0 + y_0*kernel_size + i] * c_x_0 * c_y_0;
+                    else dropout_sum += c_x_0 * c_y_0;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_0 >= 0 && y_0 < kernel_size) val += src_weight_gpu[x_1 + y_0*kernel_size + i] * c_x_1 * c_y_0;
+                    else dropout_sum += c_x_1 * c_y_0;
+
+                    if (x_0 >= 0 && x_0 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_0 + y_1*kernel_size + i] * c_x_0 * c_y_1;
+                    else dropout_sum += c_x_0 * c_y_1;
+
+                    if (x_1 >= 0 && x_1 < kernel_size && y_1 >= 0 && y_1 < kernel_size) val += src_weight_gpu[x_1 + y_1*kernel_size + i] * c_x_1 * c_y_1;
+                    else dropout_sum += c_x_1 * c_y_1;
+
+                    weight_deform_gpu[x + y*kernel_size + i] = val;
+                }
+            }
+
+            // compensate for dropped items
+            const float coef = (kernel_size*kernel_size) / (kernel_size*kernel_size - dropout_sum);
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    weight_deform_gpu[x + y*kernel_size + i] *= coef;
+                }
+            }
+        }
+        else if (stage_id == 7)
+        {
+            // flip
+            for (int y = 0; y < kernel_size; ++y) {
+                for (int x = 0; x < kernel_size; ++x) {
+                    weight_deform_gpu[(kernel_size - x - 1) + y*kernel_size + i] = src_weight_gpu[x + y*kernel_size + i];
+                }
+            }
+        }
+    }
+}
+
+
+extern "C" void stretch_sway_flip_weights_gpu(const float *src_weight_gpu, float *weight_deform_gpu, int nweights, int n, int size, int angle, int reverse)
+{
+    const int kernel_area = size*size;
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(nweights / kernel_area, block_size);
+    stretch_sway_flip_weights_kernel <<< num_blocks, block_size >>> (src_weight_gpu, weight_deform_gpu, nweights, n, size, angle, reverse);
+
+    CHECK_CUDA(cudaPeekAtLastError());
+}
+
+
 __global__ void reduce_and_expand_array_kernel(const float *src_gpu, float *dst_gpu, int current_size, int groups)
 {
     const int index = blockIdx.x*blockDim.x + threadIdx.x;
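In the new `stretch_sway_flip_weights_kernel` above, each thread owns one `kernel_size * kernel_size` filter slice (`i = index * kernel_area`), and the slices are split into eight equal groups via `stage_step`: stage 0 keeps an unmodified copy, stages 1-4 stretch by the fixed factors 0.65, 0.8, 1.2 and 1.4, stages 5-6 rotate by plus/minus `angle`, and stage 7 mirrors horizontally. Two things to watch: the header declares `angle` as `int` while the kernel takes `float`, so fractional angles are truncated at the call boundary, and when the slice count is not a multiple of 8 the trailing slices get `stage_id >= 8`, match no branch, and leave their `weight_deform_gpu` entries unwritten. A minimal host-side sketch of the same partitioning (example sizes are mine, not from the patch):

```c
#include <stdio.h>

int main(void) {
    const int size = 3;                                   /* filter is size x size */
    const int kernel_area = size * size;
    const int n_slices = 64;                              /* assumed (c/groups) * n */
    const int nweights = n_slices * kernel_area;

    const int stage_step = (nweights / kernel_area) / 8;  /* slices per stage */

    for (int index = 0; index < n_slices; ++index) {
        const int stage_id = index / stage_step;          /* 0..7, as in the kernel */
        printf("slice %2d -> stage %d\n", index, stage_id);
    }
    return 0;
}
```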
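The stretch and rotate stages share the same 2x2 bilinear tap: the source coordinate is split into its four integer neighbours, in-bounds neighbours contribute `weight * coefficient`, and the coefficients of out-of-bounds neighbours accumulate into `dropout_sum`, which the rotate stages then use to renormalize by `kernel_area / (kernel_area - dropout_sum)`. A self-contained sketch of one tap (the helper name is mine; the logic mirrors the kernel's per-cell code):

```c
#include <math.h>
#include <stdio.h>

/* One bilinear tap at (x_s, y_s) into a size x size filter `w`; the
   interpolation weight of any out-of-bounds neighbour is added to
   *dropout_sum instead of the result, as in the kernel. */
static float bilinear_tap(const float *w, int size, float x_s, float y_s,
                          float *dropout_sum)
{
    int x_0 = (int)floorf(x_s), x_1 = (int)ceilf(x_s);
    if (x_0 == x_1) x_1 = x_0 + 1;
    int y_0 = (int)floorf(y_s), y_1 = (int)ceilf(y_s);
    if (y_0 == y_1) y_1 = y_0 + 1;

    const float c_x_0 = x_1 - x_s, c_x_1 = x_s - x_0;
    const float c_y_0 = y_1 - y_s, c_y_1 = y_s - y_0;

    float val = 0;
    if (x_0 >= 0 && x_0 < size && y_0 >= 0 && y_0 < size) val += w[x_0 + y_0*size] * c_x_0 * c_y_0;
    else *dropout_sum += c_x_0 * c_y_0;
    if (x_1 >= 0 && x_1 < size && y_0 >= 0 && y_0 < size) val += w[x_1 + y_0*size] * c_x_1 * c_y_0;
    else *dropout_sum += c_x_1 * c_y_0;
    if (x_0 >= 0 && x_0 < size && y_1 >= 0 && y_1 < size) val += w[x_0 + y_1*size] * c_x_0 * c_y_1;
    else *dropout_sum += c_x_0 * c_y_1;
    if (x_1 >= 0 && x_1 < size && y_1 >= 0 && y_1 < size) val += w[x_1 + y_1*size] * c_x_1 * c_y_1;
    else *dropout_sum += c_x_1 * c_y_1;
    return val;
}

int main(void) {
    const float w[9] = { 1, 2, 3,  4, 5, 6,  7, 8, 9 };   /* toy 3x3 filter */
    float dropped = 0;
    /* sample a quarter-cell off the left edge: part of the mass drops out */
    float v = bilinear_tap(w, 3, -0.25f, 0.0f, &dropped);
    printf("val = %.3f, dropped coefficient mass = %.3f\n", v, dropped);
    return 0;
}
```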
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 45fff784..f31f8488 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -1210,6 +1210,7 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init
         if (l.rotate) rotate_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 1);
         else if (l.sway) sway_and_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1);
         else if (l.stretch) stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 0, 1);
+        else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1);
 
         //simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu);
 
@@ -1268,6 +1269,7 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init
         if (l.rotate) rotate_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0);
         else if (l.sway) sway_and_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0);
         else if (l.stretch) stretch_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0, 0);
+        else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0);
         //printf(" angle = %f, reverse = %d \n", l.angle, 0);
 
         //cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
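`update_convolutional_layer_gpu` calls the new deform twice per step: first on the accumulated gradient with `reverse = 1`, which applies the inverse mapping (`scale` becomes `1/scale`, `angle` flips sign) so each stage's gradient is pulled back into the base filter's frame, and then with `reverse = 0` after the optimizer step to re-apply the forward deform to the updated weights. A tiny standalone check of that inverse parameterization (example values are mine, not from the patch):

```c
#include <stdio.h>

int main(void) {
    /* 3x3 kernel: centre x_c = kernel_size / 2 = 1 */
    const float scale = 1.4f, x_c = 1.0f, x = 2.0f;
    float x_fwd  = x_c + (x - x_c) / scale;               /* forward stretch */
    float x_back = x_c + (x_fwd - x_c) / (1.0f / scale);  /* reverse pass uses 1/scale */
    printf("x = %.3f -> forward %.3f -> reverse %.3f\n", x, x_fwd, x_back);
    return 0;   /* prints the original coordinate: the two passes cancel */
}
```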
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index cc05b3b0..d912e950 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -232,7 +232,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference, size_t workspace_
     // 3. FP32 Master Copy of Weights
     // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
     CHECK_CUDNN(cudnnSetConvolutionGroupCount(l->convDesc, l->groups));
-    if (l->groups == 1) CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH));
+    //if (l->groups == 1) CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH));
 #if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72)  // cuDNN >= 7.2
     //CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); // reduces the speed of regular and group convolution
 #endif
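Separate from the deform work, this hunk disables `CUDNN_TENSOR_OP_MATH` for single-group convolutions, so cuDNN will no longer select Tensor Core algorithms from this call site. If it were ever restored, it would presumably go back behind a version guard, roughly as below (hypothetical fragment in the context of `cudnn_convolutional_setup`, not part of the patch; `CUDNN_TENSOR_OP_MATH` requires cuDNN >= 7):

```c
/* hypothetical re-enable of Tensor Core math, guarded by cuDNN version */
#if (CUDNN_MAJOR >= 7)
    if (l->groups == 1) {
        CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH));
    }
#endif
```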
diff --git a/src/data.c b/src/data.c
index b84ae7c2..883585fc 100644
--- a/src/data.c
+++ b/src/data.c
@@ -1628,7 +1628,9 @@ data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *h
             }
 
             for (j = 0; j < d.y.cols; ++j) {
-                d.y.vals[i][j] = d.y.vals[i][j] * s1 + d2.y.vals[i][j] * s2 + d3.y.vals[i][j] * s3 + d4.y.vals[i][j] * s4;
+                const float max_s = 1;// max_val_cmp(s1, max_val_cmp(s2, max_val_cmp(s3, s4)));
+
+                d.y.vals[i][j] = d.y.vals[i][j] * s1 / max_s + d2.y.vals[i][j] * s2 / max_s + d3.y.vals[i][j] * s3 / max_s + d4.y.vals[i][j] * s4 / max_s;
             }
         }
     }
diff --git a/src/parser.c b/src/parser.c
index 8da5189e..94d23a30 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -204,11 +204,12 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     int sway = option_find_int_quiet(options, "sway", 0);
     int rotate = option_find_int_quiet(options, "rotate", 0);
     int stretch = option_find_int_quiet(options, "stretch", 0);
-    if ((sway + rotate + stretch) > 1) {
+    int stretch_sway = option_find_int_quiet(options, "stretch_sway", 0);
+    if ((sway + rotate + stretch + stretch_sway) > 1) {
         printf(" Error: should be used only 1 param: sway=1, rotate=1 or stretch=1 in the [convolutional] layer \n");
         exit(0);
     }
-    int deform = sway || rotate || stretch;
+    int deform = sway || rotate || stretch || stretch_sway;
     if (deform && size == 1) {
         printf(" Error: params (sway=1, rotate=1 or stretch=1) should be used only with size >=3 in the [convolutional] layer \n");
         exit(0);
@@ -220,6 +221,7 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     layer.sway = sway;
     layer.rotate = rotate;
     layer.stretch = stretch;
+    layer.stretch_sway = stretch_sway;
     layer.angle = option_find_float_quiet(options, "angle", 15);
 
     if(params.net.adam){
@@ -778,7 +780,7 @@ dropout_layer parse_dropout(list *options, size_params params)
         printf(" [dropout] - dropblock_size_abs = %d that is bigger than layer size %d x %d \n", dropblock_size_abs, params.w, params.h);
         dropblock_size_abs = min_val_cmp(params.w, params.h);
     }
-    if (!dropblock_size_rel && !dropblock_size_abs) {
+    if (dropblock && !dropblock_size_rel && !dropblock_size_abs) {
         printf(" [dropout] - None of the parameters (dropblock_size_rel or dropblock_size_abs) are set, will be used: dropblock_size_abs = 7 \n");
         dropblock_size_abs = 7;
     }
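In the `src/data.c` hunk above, the 4-way mixup label blend gains a `max_s` divisor, but as committed `max_s` is pinned to 1 with the `max_val_cmp` expression left in a comment, so behaviour is unchanged until the normalization is switched on. A self-contained sketch with the commented-out expression enabled (helper and toy values are mine); dividing by the dominant weight pushes the strongest class toward 1, and the blended value can exceed 1 when several source images share a class:

```c
#include <stdio.h>

static float max_val_cmp(float a, float b) { return a > b ? a : b; }

int main(void) {
    /* per-image blend weights from the 4-way mixup, summing to 1 */
    const float s1 = 0.4f, s2 = 0.3f, s3 = 0.2f, s4 = 0.1f;
    const float y1 = 1, y2 = 0, y3 = 1, y4 = 0;   /* toy one-hot labels */

    /* as committed max_s = 1; here the commented-out normalization is on */
    const float max_s = max_val_cmp(s1, max_val_cmp(s2, max_val_cmp(s3, s4)));

    const float y = y1*s1/max_s + y2*s2/max_s + y3*s3/max_s + y4*s4/max_s;
    printf("max_s = %.2f, blended label = %.3f\n", max_s, y);
    return 0;
}
```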
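On the cfg side, a layer opts in with `stretch_sway=1` (plus the usual `angle=`, defaulting to 15) in a `[convolutional]` section with `size >= 3`. The mutual-exclusion check now counts all four deform flags, though its error message still lists only `sway`/`rotate`/`stretch` and is worth updating while here; the final hunk is an unrelated fix that makes the dropblock-size warning and default fire only when `dropblock` is actually enabled. A minimal standalone sketch of the exclusivity rule (option reading stubbed out with plain ints):

```c
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    /* stand-ins for option_find_int_quiet(options, "...", 0) */
    int sway = 0, rotate = 0, stretch = 0, stretch_sway = 1;
    int size = 3;

    /* at most one deform mode may be set on a [convolutional] layer */
    if ((sway + rotate + stretch + stretch_sway) > 1) {
        fprintf(stderr, "only one of sway/rotate/stretch/stretch_sway may be set\n");
        exit(1);
    }
    int deform = sway || rotate || stretch || stretch_sway;
    if (deform && size == 1) {                  /* deform needs size >= 3 */
        fprintf(stderr, "deform params require size >= 3\n");
        exit(1);
    }
    printf("deform = %d\n", deform);
    return 0;
}
```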