diff --git a/src/blas.c b/src/blas.c index 6d0c0cdd..9badc55f 100644 --- a/src/blas.c +++ b/src/blas.c @@ -79,7 +79,7 @@ void shortcut_multilayer_cpu(int size, int src_outputs, int batch, int n, int *o // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w) const int layer_step = nweights / (n + 1); // 1 or l.c or (l.c * l.h * l.w) int step = 0; - if (weights) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 + if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 int id; #pragma omp parallel for @@ -148,7 +148,7 @@ void backward_shortcut_multilayer_cpu(int size, int src_outputs, int batch, int // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w) const int layer_step = nweights / (n + 1); // 1 or l.c or (l.c * l.h * l.w) int step = 0; - if (weights) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 + if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 int id; #pragma omp parallel for diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index a2fab184..18f2d95f 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -687,7 +687,7 @@ __global__ void shortcut_multilayer_kernel(int size, int src_outputs, int batch, // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w) const int layer_step = nweights / (n + 1); // 1 or l.c or (l.c * l.h * l.w) int step = 0; - if (weights_gpu) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 + if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 int src_id = id; const int src_i = src_id % src_outputs; @@ -762,7 +762,7 @@ __global__ void backward_shortcut_multilayer_kernel(int size, int src_outputs, i // nweights - l.n or l.n*l.c or (l.n*l.c*l.h*l.w) const int layer_step = nweights / (n + 1); // 1 or l.c or (l.c * l.h * l.w) int step = 0; - if (weights_gpu) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 + if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 int src_id = id; const int src_i = src_id % src_outputs; @@ -836,7 +836,8 @@ extern "C" void backward_shortcut_multilayer_gpu(int src_outputs, int batch, int float **layers_delta_gpu, float *delta_out, float *delta_in, float *weights_gpu, float *weight_updates_gpu, int nweights, float *in, float **layers_output_gpu, WEIGHTS_NORMALIZATION_T weights_normalizion) { const int layer_step = nweights / (n + 1); // 1 or l.c or (l.c * l.h * l.w) - const int step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 + int step = 0; + if (nweights > 0) step = src_outputs / layer_step; // (l.c * l.h * l.w) or (l.w*l.h) or 1 //printf(" nweights = %d, n = %d, layer_step = %d, step = %d \n", nweights, n, layer_step, step); //printf(" src_outputs = %d, batch = %d, n = %d \n", src_outputs, batch, n); diff --git a/src/darknet.c b/src/darknet.c index ab5931da..2b0d6a6d 100644 --- a/src/darknet.c +++ b/src/darknet.c @@ -466,6 +466,8 @@ int main(int argc, char **argv) show_opencv_info(); + init_cpu(); + if (0 == strcmp(argv[1], "average")){ average(argc, argv); } else if (0 == strcmp(argv[1], "yolo")){ diff --git a/src/parser.c b/src/parser.c index 291be389..0e530e48 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1544,7 +1544,6 @@ void save_shortcut_weights(layer l, FILE *fp) #endif int num = l.nweights; fwrite(l.weights, sizeof(float), num, fp); - } void save_convolutional_weights(layer l, FILE *fp) @@ -1822,10 +1821,6 @@ void load_convolutional_weights(layer l, FILE *fp) void load_shortcut_weights(layer l, FILE *fp) { - if (l.binary) { - //load_convolutional_weights_binary(l, fp); - //return; - } int num = l.nweights; int read_bytes; read_bytes = fread(l.weights, sizeof(float), num, fp); diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index 3d0d0098..6ca14a60 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -186,14 +186,16 @@ void backward_shortcut_layer(const layer l, network_state state) void update_shortcut_layer(layer l, int batch, float learning_rate_init, float momentum, float decay) { - float learning_rate = learning_rate_init*l.learning_rate_scale; - //float momentum = a.momentum; - //float decay = a.decay; - //int batch = a.batch; + if (l.nweights > 0) { + float learning_rate = learning_rate_init*l.learning_rate_scale; + //float momentum = a.momentum; + //float decay = a.decay; + //int batch = a.batch; - axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); - axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); - scal_cpu(l.nweights, momentum, l.weight_updates, 1); + axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); + axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); + scal_cpu(l.nweights, momentum, l.weight_updates, 1); + } } #ifdef GPU @@ -238,28 +240,30 @@ void backward_shortcut_layer_gpu(const layer l, network_state state) void update_shortcut_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay) { - float learning_rate = learning_rate_init*l.learning_rate_scale; - //float momentum = a.momentum; - //float decay = a.decay; - //int batch = a.batch; + if (l.nweights > 0) { + float learning_rate = learning_rate_init*l.learning_rate_scale; + //float momentum = a.momentum; + //float decay = a.decay; + //int batch = a.batch; - fix_nan_and_inf(l.weight_updates_gpu, l.nweights); - fix_nan_and_inf(l.weights_gpu, l.nweights); + fix_nan_and_inf(l.weight_updates_gpu, l.nweights); + fix_nan_and_inf(l.weights_gpu, l.nweights); - axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); + axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); - //if (l.clip) { - // constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1); - //} + //if (l.clip) { + // constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1); + //} + } } void pull_shortcut_layer(layer l) { cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); CHECK_CUDA(cudaPeekAtLastError()); - cudaStreamSynchronize(get_cuda_stream()); + CHECK_CUDA(cudaStreamSynchronize(get_cuda_stream())); } void push_shortcut_layer(layer l)