diff --git a/dlib/dnn/core.h b/dlib/dnn/core.h
index 91ea66580..37b188a23 100644
--- a/dlib/dnn/core.h
+++ b/dlib/dnn/core.h
@@ -365,7 +365,6 @@ namespace dlib
         {
             dimpl::subnet_wrapper wsub(subnetwork);
             params_grad.copy_size(details.get_layer_params());
-            params_grad = 0;
             details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
@@ -601,7 +600,6 @@ namespace dlib
         {
             subnet_wrapper wsub(x, grad_final_ignored);
             params_grad.copy_size(details.get_layer_params());
-            params_grad = 0;
             details.backward(get_output(), get_gradient_input(), wsub, static_cast<tensor&>(params_grad));
             // Don't try to adjust the parameters if this layer doesn't have any.
             if (params_grad.size() != 0)
@@ -1605,11 +1603,11 @@ namespace dlib
         // Now tell the layer to compute all the gradients.  In the rest of this function
         // we will just be checking that these gradients were computed correctly by
         // comparing them to a central differences approximation.
-        resizable_tensor params_grad, random_noise;
+        resizable_tensor params_grad;
         params_grad.copy_size(l.get_layer_params());
-        random_noise.copy_size(l.get_layer_params());
-        randomize_parameters(random_noise, 5, rnd);
-        params_grad = random_noise;
+        // Set the params grad to something crazy so that it's very obvious if it doesn't
+        // get fully assigned.
+        params_grad = std::numeric_limits<float>::infinity();

         l.backward(output, input_grad, subnetwork, params_grad);

@@ -1631,7 +1629,7 @@ namespace dlib
             // Compute a reference derivative via a central differences approximation and
             // compare it to the one output by the layer and make sure they match.
             double reference_derivative = (dot(out2,input_grad)-dot(out3, input_grad))/(2*eps);
-            double output_derivative = params_grad.host()[i]-random_noise.host()[i];
+            double output_derivative = params_grad.host()[i];
             double relative_error = (reference_derivative - output_derivative)/(reference_derivative + 1e-100);
             if (std::abs(relative_error) > 0.01)
             {
diff --git a/dlib/dnn/cudnn_dlibapi.cpp b/dlib/dnn/cudnn_dlibapi.cpp
index bdb44b3d0..1d0c2b710 100644
--- a/dlib/dnn/cudnn_dlibapi.cpp
+++ b/dlib/dnn/cudnn_dlibapi.cpp
@@ -221,7 +221,7 @@ namespace dlib
                   gradient_input.size() > 0,"");

             const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
             check(cudnnConvolutionBackwardBias(context(),
                                     &alpha,
                                     descriptor(gradient_input),
@@ -483,7 +483,7 @@ namespace dlib
         )
         {
             const float alpha = 1;
-            const float beta = 1;
+            const float beta = 0;
             check(cudnnConvolutionBackwardFilter_v3(context(),
                                     &alpha,
                                     descriptor(data),
diff --git a/dlib/dnn/cudnn_dlibapi.h b/dlib/dnn/cudnn_dlibapi.h
index 6722e85ab..c605cac4c 100644
--- a/dlib/dnn/cudnn_dlibapi.h
+++ b/dlib/dnn/cudnn_dlibapi.h
@@ -128,7 +128,7 @@ namespace dlib
                 - let OUT be the output of add(1,OUT,1,BIAS)
                 - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
                 - Then this function computes the gradient of f() with respect to BIAS and
-                  adds it to grad.
+                  assigns it to grad.
         !*/

    // ------------------------------------------------------------------------------------
@@ -219,7 +219,7 @@ namespace dlib
                 - let OUT be the output of (*this)(OUT,data,filters).
                 - let f(data,filters) == dot(OUT, gradient_input)
                 - This function finds the gradient of f() with respect to filters
-                  and adds this gradient to filters_gradient.
+                  and assigns this gradient to filters_gradient.
        !*/

    private:
diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index d23adb1a7..4a5e4fcbd 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -92,7 +92,7 @@ namespace dlib
         void backward(const tensor& , const tensor& gradient_input, SUBNET& sub, tensor& params_grad)
         {
             // compute the gradient of the parameters.
-            params_grad += trans(mat(sub.get_output()))*mat(gradient_input);
+            params_grad = trans(mat(sub.get_output()))*mat(gradient_input);

             // compute the gradient for the data
             sub.get_gradient_input() += mat(gradient_input)*trans(mat(params));
@@ -161,7 +161,9 @@ namespace dlib
             for (unsigned long i = 0; i < sub.get_output().size(); ++i)
             {
                 if (in[i] > 0)
-                    out[i] += grad[i];
+                    out[i] = grad[i];
+                else
+                    out[i] = 0;
             }
         }

diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h
index 3b0062e02..7c073227e 100644
--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
@@ -201,8 +201,8 @@ namespace dlib
              draw inputs from the immediate sub layer, sub.subnet(), or
              any earlier layer.  So you must consider the gradients with
              respect to all inputs drawn from sub)
-            Finally, backward() adds these gradients into the output by performing:
-                - params_grad += PARAMETER_GRADIENT
+            Finally, backward() outputs these gradients by performing:
+                - params_grad = PARAMETER_GRADIENT
                 - for all valid I:
                     - layer(sub).get_gradient_input() += DATA_GRADIENT_I
         !*/
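
Note on the contract change (a minimal standalone sketch, not dlib code): backward() now assigns the parameter gradient instead of adding to it, and test_layer() pre-fills params_grad with infinity so any element a layer forgets to write shows up immediately in the central-differences comparison. The toy layer below (f(x; p) = p*x, all names illustrative) mirrors that pattern with a single scalar parameter:

    #include <cmath>
    #include <iostream>
    #include <limits>

    // Toy "layer" y = p*x.  Under the new contract the parameter gradient is
    // ASSIGNED (df/dp = gradient_input*x), never accumulated with +=.
    void backward(float x, float gradient_input, float& param_grad)
    {
        param_grad = gradient_input * x;   // '=' instead of '+='
    }

    int main()
    {
        const float x = 2.0f, p = 3.0f, gradient_input = 1.0f, eps = 1e-3f;

        // Pre-fill with infinity so an unassigned gradient can't silently pass the check.
        float param_grad = std::numeric_limits<float>::infinity();
        backward(x, gradient_input, param_grad);

        // Central differences reference derivative of f(p) = gradient_input*(p*x).
        const double reference = (double(p+eps)*x - double(p-eps)*x)*gradient_input/(2*eps);
        const double relative_error = (reference - param_grad)/(reference + 1e-100);
        std::cout << "relative error: " << std::abs(relative_error) << "\n";
        return std::abs(relative_error) > 0.01;   // nonzero exit means the gradient check failed
    }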
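
On the cuDNN side: cudnnConvolutionBackwardBias and cudnnConvolutionBackwardFilter_v3 blend their result into the destination as dst = alpha*result + beta*dst. With layers now assigning gradients rather than accumulating them, beta moves from 1 to 0 so the gradient buffers are simply overwritten and no longer need to be zeroed first (hence the dropped "params_grad = 0" lines in core.h). A tiny sketch of that blending convention, using a hypothetical helper rather than a real cuDNN call:

    #include <iostream>

    // cuDNN-style output blending: dst = alpha*computed + beta*dst.
    // beta == 1 accumulates into dst; beta == 0 overwrites it, so stale or
    // uninitialized buffer contents cannot leak into the gradient.
    float blend(float computed, float dst, float alpha, float beta)
    {
        return alpha*computed + beta*dst;
    }

    int main()
    {
        const float computed = 2.5f;    // freshly computed gradient value
        const float stale    = 100.0f;  // whatever was left in the buffer
        std::cout << "beta=1: " << blend(computed, stale, 1, 1) << "\n";  // 102.5 (accumulate)
        std::cout << "beta=0: " << blend(computed, stale, 1, 0) << "\n";  // 2.5   (assign)
    }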