diff --git a/dlib/dnn/core.h b/dlib/dnn/core.h
index 91ea66580..37b188a23 100644
--- a/dlib/dnn/core.h
+++ b/dlib/dnn/core.h
@@ -365,7 +365,6 @@ namespace dlib
         {
             dimpl::subnet_wrapper wsub(subnetwork);
             params_grad.copy_size(details.get_layer_params());
-            params_grad = 0;
             details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
@@ -601,7 +600,6 @@ namespace dlib
         {
             subnet_wrapper wsub(x, grad_final_ignored);
             params_grad.copy_size(details.get_layer_params());
-            params_grad = 0;
             details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
@@ -1605,11 +1603,11 @@ namespace dlib
         // Now tell the layer to compute all the gradients.  In the rest of this function
         // we will just be checking that these gradients were computed correctly by
         // comparing them to a central differences approximation.
-        resizable_tensor params_grad, random_noise;
+        resizable_tensor params_grad;
         params_grad.copy_size(l.get_layer_params());
-        random_noise.copy_size(l.get_layer_params());
-        randomize_parameters(random_noise, 5, rnd);
-        params_grad = random_noise;
+        // Set the params grad to something crazy so that it's very obvious if it doesn't
+        // get fully assigned.
+        params_grad = std::numeric_limits<float>::infinity();

         l.backward(output, input_grad, subnetwork, params_grad);

@@ -1631,7 +1629,7 @@ namespace dlib
             // Compute a reference derivative via a central differences approximation and
             // compare it to the one output by the layer and make sure they match.
             double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
-            double output_derivative = params_grad.host()[i]-random_noise.host()[i];
+            double output_derivative = params_grad.host()[i];
             double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
             if (std::abs(relative_error) > 0.01)
             {
diff --git a/dlib/dnn/cudnn_dlibapi.cpp b/dlib/dnn/cudnn_dlibapi.cpp
index bdb44b3d0..1d0c2b710 100644
--- a/dlib/dnn/cudnn_dlibapi.cpp
+++ b/dlib/dnn/cudnn_dlibapi.cpp
@@ -221,7 +221,7 @@ namespace dlib
                   gradient_input.size() > 0,"");

             const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
             check(cudnnConvolutionBackwardBias(context(),
                                     &alpha,
                                     descriptor(gradient_input),
@@ -483,7 +483,7 @@ namespace dlib
         )
         {
             const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
             check(cudnnConvolutionBackwardFilter_v3(context(),
                                     &alpha,
                                     descriptor(data),
diff --git a/dlib/dnn/cudnn_dlibapi.h b/dlib/dnn/cudnn_dlibapi.h
index 6722e85ab..c605cac4c 100644
--- a/dlib/dnn/cudnn_dlibapi.h
+++ b/dlib/dnn/cudnn_dlibapi.h
@@ -128,7 +128,7 @@ namespace dlib
                 - let OUT be the output of add(1,OUT,1,BIAS)
                 - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
                 - Then this function computes the gradient of f() with respect to BIAS and
-                  adds it to grad.
+                  assigns it to grad.
         !*/

    // ------------------------------------------------------------------------------------
@@ -219,7 +219,7 @@ namespace dlib
                 - let OUT be the output of (*this)(OUT,data,filters).
                 - let f(data,filters) == dot(OUT, gradient_input)
                 - This function finds the gradient of f() with respect to filters
-                  and adds this gradient to filters_gradient.
+                  and assigns this gradient to filters_gradient.
        !*/

    private:
diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index d23adb1a7..4a5e4fcbd 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -92,7 +92,7 @@ namespace dlib
         void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
         {
             // compute the gradient of the parameters.
-            params_grad += trans(mat(sub.get_output()))*mat(gradient_input);
+            params_grad = trans(mat(sub.get_output()))*mat(gradient_input);

             // compute the gradient for the data
             sub.get_gradient_input() += mat(gradient_input)*trans(mat(params));
@@ -161,7 +161,9 @@ namespace dlib
             for (unsigned long i = 0; i < sub.get_output().size(); ++i)
             {
                 if (in[i] > 0)
-                    out[i] += grad[i];
+                    out[i] = grad[i];
+                else
+                    out[i] = 0;
             }
         }

diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h
index 3b0062e02..7c073227e 100644
--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
@@ -201,8 +201,8 @@ namespace dlib
              draw inputs from the immediate sub layer, sub.subnet(), or
              any earlier layer.  So you must consider the gradients with
              respect to all inputs drawn from sub)
-            Finally, backward() adds these gradients into the output by performing:
-                - params_grad += PARAMETER_GRADIENT
+            Finally, backward() outputs these gradients by performing:
+                - params_grad = PARAMETER_GRADIENT
                 - for all valid I:
                     - layer(sub).get_gradient_input() += DATA_GRADIENT_I
         !*/
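
Note on the contract change (a minimal standalone sketch, not dlib code): backward() now assigns the parameter gradient instead of adding to it, and test_layer() pre-fills params_grad with infinity so any element a layer forgets to write shows up immediately in the central-differences comparison. The toy layer below (f(x; p) = p*x, all names illustrative) mirrors that pattern with a single scalar parameter:

    #include <cmath>
    #include <iostream>
    #include <limits>

    // Toy "layer" y = p*x.  Under the new contract the parameter gradient is
    // ASSIGNED (df/dp = gradient_input*x), never accumulated with +=.
    void backward(float x, float gradient_input, float& param_grad)
    {
        param_grad = gradient_input * x;   // '=' instead of '+='
    }

    int main()
    {
        const float x = 2.0f, p = 3.0f, gradient_input = 1.0f, eps = 1e-3f;

        // Pre-fill with infinity so an unassigned gradient can't silently pass the check.
        float param_grad = std::numeric_limits<float>::infinity();
        backward(x, gradient_input, param_grad);

        // Central differences reference derivative of f(p) = gradient_input*(p*x).
        const double reference = (double(p+eps)*x - double(p-eps)*x)*gradient_input/(2*eps);
        const double relative_error = (reference - param_grad)/(reference + 1e-100);
        std::cout << "relative error: " << std::abs(relative_error) << "\n";
        return std::abs(relative_error) > 0.01;   // nonzero exit means the gradient check failed
    }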
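
On the cuDNN side: cudnnConvolutionBackwardBias and cudnnConvolutionBackwardFilter_v3 blend their result into the destination as dst = alpha*result + beta*dst. With layers now assigning gradients rather than accumulating them, beta moves from 1 to 0 so the gradient buffers are simply overwritten and no longer need to be zeroed first (hence the dropped "params_grad = 0" lines in core.h). A tiny sketch of that blending convention, using a hypothetical helper rather than a real cuDNN call:

    #include <iostream>

    // cuDNN-style output blending: dst = alpha*computed + beta*dst.
    // beta == 1 accumulates into dst; beta == 0 overwrites it, so stale or
    // uninitialized buffer contents cannot leak into the gradient.
    float blend(float computed, float dst, float alpha, float beta)
    {
        return alpha*computed + beta*dst;
    }

    int main()
    {
        const float computed = 2.5f;    // freshly computed gradient value
        const float stale    = 100.0f;  // whatever was left in the buffer
        std::cout << "beta=1: " << blend(computed, stale, 1, 1) << "\n";  // 102.5 (accumulate)
        std::cout << "beta=0: " << blend(computed, stale, 1, 0) << "\n";  // 2.5   (assign)
    }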