mirror of https://github.com/davisking/dlib.git
Removed unnecessary zero initialization of parameter gradients in core.h.
parent eada4be8e3
commit e2a2a26a1b
@@ -365,7 +365,6 @@ namespace dlib
         {
             dimpl::subnet_wrapper<subnet_type> wsub(subnetwork);
             params_grad.copy_size(details.get_layer_params());
-            params_grad = 0;
             details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
@@ -601,7 +600,6 @@ namespace dlib
         {
             subnet_wrapper wsub(x, grad_final_ignored);
             params_grad.copy_size(details.get_layer_params());
-            params_grad = 0;
             details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
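The two params_grad = 0; lines above are removable because backward() now assigns the parameter gradient outright instead of accumulating into a pre-zeroed buffer. A minimal sketch of the two conventions, using std::vector<float> as a stand-in for resizable_tensor (illustrative only, not dlib's API):

    #include <cstddef>
    #include <vector>

    // Old convention: the layer accumulated, so the caller had to zero params_grad first.
    void backward_accumulate(const std::vector<float>& g, std::vector<float>& params_grad)
    {
        for (std::size_t i = 0; i < g.size(); ++i)
            params_grad[i] += g[i];   // correct only if params_grad was pre-zeroed
    }

    // New convention: the layer assigns every element, so no pre-initialization is needed.
    void backward_assign(const std::vector<float>& g, std::vector<float>& params_grad)
    {
        for (std::size_t i = 0; i < g.size(); ++i)
            params_grad[i] = g[i];    // fully determines params_grad on its own
    }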
@@ -1605,11 +1603,11 @@ namespace dlib
         // Now tell the layer to compute all the gradients. In the rest of this function
         // we will just be checking that these gradients were computed correctly by
         // comparing them to a central differences approximation.
-        resizable_tensor params_grad, random_noise;
+        resizable_tensor params_grad;
         params_grad.copy_size(l.get_layer_params());
-        random_noise.copy_size(l.get_layer_params());
-        randomize_parameters(random_noise, 5, rnd);
-        params_grad = random_noise;
+        // Set the params grad to something crazy so that it's very obvious if it doesn't
+        // get fully assigned.
+        params_grad = std::numeric_limits<float>::infinity();
         l.backward(output, input_grad, subnetwork, params_grad);
@@ -1631,7 +1629,7 @@ namespace dlib
             // Compute a reference derivative via a central differences approximation and
             // compare it to the one output by the layer and make sure they match.
             double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
-            double output_derivative = params_grad.host()[i]-random_noise.host()[i];
+            double output_derivative = params_grad.host()[i];
             double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
             if (std::abs(relative_error) > 0.01)
             {
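With the assignment contract, the gradient checker no longer seeds params_grad with random noise and subtracts it back out afterwards; it fills the buffer with infinity instead, so any element a layer forgets to assign stays infinite and trips the relative-error test. A small stand-alone illustration of that sentinel idea (the deliberately buggy loop below is hypothetical, just to show the mechanism):

    #include <cassert>
    #include <cmath>
    #include <cstddef>
    #include <limits>
    #include <vector>

    int main()
    {
        // Pretend this is the parameter gradient buffer handed to backward().
        std::vector<float> params_grad(4, std::numeric_limits<float>::infinity());

        // A buggy layer that only assigns the first three elements.
        for (std::size_t i = 0; i + 1 < params_grad.size(); ++i)
            params_grad[i] = 0.5f;

        // Any element the layer failed to assign is still infinite, so a
        // relative-error comparison against a finite reference blows up.
        bool fully_assigned = true;
        for (float v : params_grad)
            if (!std::isfinite(v))
                fully_assigned = false;

        assert(!fully_assigned);  // the sentinel exposed the missing assignment
        return 0;
    }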
@@ -221,7 +221,7 @@ namespace dlib
                 gradient_input.size() > 0,"");

            const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
            check(cudnnConvolutionBackwardBias(context(),
                                    &alpha,
                                    descriptor(gradient_input),
@@ -483,7 +483,7 @@ namespace dlib
        )
        {
            const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
            check(cudnnConvolutionBackwardFilter_v3(context(),
                                    &alpha,
                                    descriptor(data),
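cuDNN blends each freshly computed result into the destination tensor as roughly dst = alpha*result + beta*dst, so beta = 1 accumulated the bias and filter gradients into whatever was already in the buffer, while beta = 0 simply overwrites the destination, matching the assign-only contract. A plain-C++ illustration of that blending rule (not an actual cuDNN call):

    #include <cstddef>

    // Mimics cuDNN's output blending: dst = alpha*result + beta*dst.
    void blend(const float* result, float* dst, std::size_t n, float alpha, float beta)
    {
        for (std::size_t i = 0; i < n; ++i)
            dst[i] = alpha * result[i] + beta * dst[i];
    }

    // beta == 1 accumulates into dst (old behaviour, needs a zeroed dst);
    // beta == 0 ignores dst's previous contents and just assigns (new behaviour).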
@@ -128,7 +128,7 @@ namespace dlib
                - let OUT be the output of add(1,OUT,1,BIAS)
                - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
                - Then this function computes the gradient of f() with respect to BIAS and
-                  adds it to grad.
+                  assigns it to grad.
        !*/

    // ------------------------------------------------------------------------------------
@@ -219,7 +219,7 @@ namespace dlib
                - let OUT be the output of (*this)(OUT,data,filters).
                - let f(data,filters) == dot(OUT, gradient_input)
                - This function finds the gradient of f() with respect to filters
-                  and adds this gradient to filters_gradient.
+                  and assigns this gradient to filters_gradient.
        !*/

    private:
@@ -92,7 +92,7 @@ namespace dlib
        void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
        {
            // compute the gradient of the parameters.
-            params_grad += trans(mat(sub.get_output()))*mat(gradient_input);
+            params_grad = trans(mat(sub.get_output()))*mat(gradient_input);

            // compute the gradient for the data
            sub.get_gradient_input() += mat(gradient_input)*trans(mat(params));
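For the fully connected layer above, the gradient of f == dot(gradient_input, input*params) with respect to params is input^T * gradient_input, and it is now written straight into params_grad rather than added to it. A plain-loop sketch of that formula (illustrative, not dlib's matrix expression machinery):

    #include <cstddef>

    // params_grad = X^T * G, written as explicit loops.
    // X is n x k (the layer's input), G is n x m (gradient_input), params_grad is k x m.
    void fc_param_gradient(const float* X, const float* G, float* params_grad,
                           std::size_t n, std::size_t k, std::size_t m)
    {
        for (std::size_t a = 0; a < k; ++a)
            for (std::size_t b = 0; b < m; ++b)
            {
                float sum = 0;
                for (std::size_t r = 0; r < n; ++r)
                    sum += X[r*k + a] * G[r*m + b];
                params_grad[a*m + b] = sum;   // assign, do not accumulate
            }
    }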
@@ -161,7 +161,9 @@ namespace dlib
            for (unsigned long i = 0; i < sub.get_output().size(); ++i)
            {
                if (in[i] > 0)
-                    out[i] += grad[i];
+                    out[i] = grad[i];
+                else
+                    out[i] = 0;
            }

        }
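Because the destination buffer is no longer assumed to start at zero, the relu gradient has to write every element: it passes the incoming gradient through where the forward input was positive and writes an explicit zero elsewhere. The rule in isolation (a sketch, not the dlib source):

    #include <cstddef>

    // ReLU backward pass: out[i] = grad[i] if in[i] > 0, otherwise 0.
    // Every element is assigned, so the buffer needs no pre-initialization.
    void relu_gradient(const float* in, const float* grad, float* out, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = (in[i] > 0) ? grad[i] : 0;
    }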
@@ -201,8 +201,8 @@ namespace dlib
                    draw inputs from the immediate sub layer, sub.subnet(), or
                    any earlier layer.  So you must consider the gradients with
                    respect to all inputs drawn from sub)
-                Finally, backward() adds these gradients into the output by performing:
-                    - params_grad += PARAMETER_GRADIENT
+                Finally, backward() outputs these gradients by performing:
+                    - params_grad = PARAMETER_GRADIENT
                - for all valid I:
                    - layer<I>(sub).get_gradient_input() += DATA_GRADIENT_I
        !*/
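Under the updated contract, backward() overwrites params_grad with PARAMETER_GRADIENT while data gradients are still accumulated via get_gradient_input() +=. A toy layer showing the shape of that contract (std::vector stands in for dlib::tensor; this is a sketch, not a real dlib layer):

    #include <cstddef>
    #include <vector>

    using tensor = std::vector<float>;   // stand-in for dlib::tensor, for illustration only

    // Toy layer computing y = w*x.  For f == dot(gradient_input, y):
    //   df/dw = dot(gradient_input, x)   and   df/dx = w * gradient_input.
    struct toy_layer
    {
        float w = 2.0f;

        void backward(const tensor& x, const tensor& gradient_input,
                      tensor& data_grad, tensor& params_grad) const
        {
            // Parameter gradient: fully assigned; the caller no longer zero-initializes it.
            float dw = 0;
            for (std::size_t i = 0; i < x.size(); ++i)
                dw += gradient_input[i] * x[i];
            params_grad.assign(1, dw);

            // Data gradient: still accumulated into the existing buffer.
            for (std::size_t i = 0; i < x.size(); ++i)
                data_grad[i] += w * gradient_input[i];
        }
    };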