mirror of https://github.com/AlexeyAB/darknet.git
Minor performance improvement
This commit is contained in:
parent
40006b6e9b
commit
cfac917b75
|
@ -420,7 +420,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
|
||||||
//if (state.use_mixed_precision) {
|
//if (state.use_mixed_precision) {
|
||||||
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
||||||
if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) &&
|
if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) &&
|
||||||
(l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 8)
|
(l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 8 && l.size > 1)
|
||||||
{
|
{
|
||||||
//printf("\n CUDNN_HALF!!! state.index = %d \n", state.index);
|
//printf("\n CUDNN_HALF!!! state.index = %d \n", state.index);
|
||||||
|
|
||||||
|
@ -671,7 +671,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
|
||||||
//#ifdef CUDNN_HALF
|
//#ifdef CUDNN_HALF
|
||||||
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
||||||
if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) &&
|
if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) &&
|
||||||
(l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 8)
|
(l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train && l.groups <= 8 && l.size > 1)
|
||||||
{
|
{
|
||||||
const size_t input16_size = l.batch*l.c*l.w*l.h;
|
const size_t input16_size = l.batch*l.c*l.w*l.h;
|
||||||
const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h;
|
const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h;
|
||||||
|
|
|
@ -865,8 +865,8 @@ layer parse_shortcut(list *options, size_params params, network net)
|
||||||
assert(params.w == net.layers[index].out_w && params.h == net.layers[index].out_h);
|
assert(params.w == net.layers[index].out_w && params.h == net.layers[index].out_h);
|
||||||
|
|
||||||
if (params.w != net.layers[index].out_w || params.h != net.layers[index].out_h || params.c != net.layers[index].out_c)
|
if (params.w != net.layers[index].out_w || params.h != net.layers[index].out_h || params.c != net.layers[index].out_c)
|
||||||
fprintf(stderr, " w = %d, w2 = %d, h = %d, h2 = %d, c = %d, c2 = %d \n",
|
fprintf(stderr, " (%4d x%4d x%4d) + (%4d x%4d x%4d) \n",
|
||||||
params.w, net.layers[index].out_w, params.h, net.layers[index].out_h, params.c, params.net.layers[index].out_c);
|
params.w, params.h, params.c, net.layers[index].out_w, net.layers[index].out_h, params.net.layers[index].out_c);
|
||||||
}
|
}
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
|
|
|
@ -196,12 +196,12 @@ void forward_shortcut_layer_gpu(const layer l, network_state state)
|
||||||
|
|
||||||
//-----------
|
//-----------
|
||||||
//if (l.outputs == l.input_sizes[0])
|
//if (l.outputs == l.input_sizes[0])
|
||||||
//if(l.n == 1)
|
if(l.n == 1 && l.nweights == 0)
|
||||||
//{
|
{
|
||||||
// input_shortcut_gpu(state.input, l.batch, state.net.layers[l.index].w, state.net.layers[l.index].h, state.net.layers[l.index].c,
|
input_shortcut_gpu(state.input, l.batch, state.net.layers[l.index].w, state.net.layers[l.index].h, state.net.layers[l.index].c,
|
||||||
// state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
|
state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
|
||||||
//}
|
}
|
||||||
//else
|
else
|
||||||
{
|
{
|
||||||
shortcut_multilayer_gpu(l.outputs, l.batch, l.n, l.input_sizes_gpu, l.layers_output_gpu, l.output_gpu, state.input, l.weights_gpu, l.nweights);
|
shortcut_multilayer_gpu(l.outputs, l.batch, l.n, l.input_sizes_gpu, l.layers_output_gpu, l.output_gpu, state.input, l.weights_gpu, l.nweights);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue