diff --git a/dlib/dnn/cpu_dlib.cpp b/dlib/dnn/cpu_dlib.cpp
index 6ba2ac4da..3735d7d48 100644
--- a/dlib/dnn/cpu_dlib.cpp
+++ b/dlib/dnn/cpu_dlib.cpp
@@ -2023,6 +2023,7 @@ namespace dlib
     // ------------------------------------------------------------------------------------

         void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -2045,7 +2046,15 @@ namespace dlib

             for (long i = 0; i < src.num_samples(); ++i)
             {
-                ::memcpy(dest_p, src_p, block_size * sizeof(float));
+                if (add_to)
+                {
+                    for (size_t j = 0; j < block_size; ++j)
+                        dest_p[j] += src_p[j];
+                }
+                else
+                {
+                    ::memcpy(dest_p, src_p, block_size * sizeof(float));
+                }

                 dest_p += dest_sample_size;
                 src_p += src_sample_size;
diff --git a/dlib/dnn/cpu_dlib.h b/dlib/dnn/cpu_dlib.h
index df8d20002..c117d15ab 100644
--- a/dlib/dnn/cpu_dlib.h
+++ b/dlib/dnn/cpu_dlib.h
@@ -445,6 +445,7 @@ namespace dlib
     // -----------------------------------------------------------------------------------

         void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
diff --git a/dlib/dnn/cuda_dlib.cu b/dlib/dnn/cuda_dlib.cu
index 2203e73a1..55b2cba5e 100644
--- a/dlib/dnn/cuda_dlib.cu
+++ b/dlib/dnn/cuda_dlib.cu
@@ -1377,12 +1377,33 @@ namespace dlib
     // ----------------------------------------------------------------------------------------

+        __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+        {
+            for(auto i : grid_stride_range(0, size))
+            {
+                size_t blk = i/block_size;
+                size_t j = i%block_size;
+                dest[blk*dest_stride + j] += src[blk*src_stride + j];
+            }
+        }
+
+        __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+        {
+            for(auto i : grid_stride_range(0, size))
+            {
+                size_t blk = i/block_size;
+                size_t j = i%block_size;
+                dest[blk*dest_stride + j] = src[blk*src_stride + j];
+            }
+        }
+
         void copy_tensor(
-            tensor& dest,
-            size_t dest_k_offset,
-            const tensor& src,
-            size_t src_k_offset,
-            size_t count_k
+            bool add_to,
+            tensor& dest,
+            size_t dest_k_offset,
+            const tensor& src,
+            size_t src_k_offset,
+            size_t count_k
         )
         {
             const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
@@ -1398,13 +1419,17 @@ namespace dlib
             float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
             const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();;
-
-            for (long i = 0; i < src.num_samples(); ++i)
+            if (add_to)
             {
-                CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
-
-                dest_p += dest_sample_size;
-                src_p += src_sample_size;
+                launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
+                              dest_p, block_size*dest.num_samples(),
+                              src_p, dest_sample_size, src_sample_size, block_size);
+            }
+            else
+            {
+                launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
+                              dest_p, block_size*dest.num_samples(),
+                              src_p, dest_sample_size, src_sample_size, block_size);
             }
         }
diff --git a/dlib/dnn/cuda_dlib.h b/dlib/dnn/cuda_dlib.h
index dbf1bbd69..28bed27b0 100644
--- a/dlib/dnn/cuda_dlib.h
+++ b/dlib/dnn/cuda_dlib.h
@@ -369,6 +369,7 @@ namespace dlib
     // ----------------------------------------------------------------------------------------

         void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
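Both backends now take the new `add_to` flag as their first parameter: `false` keeps the old overwrite semantics, while `true` accumulates the copied channel block into `dest`. On the CUDA side this also replaces the per-sample `cudaMemcpyAsync` loop with a single strided kernel launch covering all samples. Below is a minimal usage sketch of the two modes through the public `tt::copy_tensor` dispatcher; the tensor shapes and fill values are illustrative and not part of the patch:

```cpp
#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // dest and src share num_samples, nr, and nc; dest has more channels,
    // so src fits into a channel window of dest.
    resizable_tensor dest(2, 6, 3, 3);  // samples=2, k=6, nr=3, nc=3
    resizable_tensor src (2, 4, 3, 3);  // samples=2, k=4, nr=3, nc=3
    dest = 1;  // set every element of dest to 1
    src  = 2;  // set every element of src to 2

    // add_to == false: overwrite.  Channels [1, 5) of each dest sample become 2.
    tt::copy_tensor(false, dest, 1, src, 0, src.k());

    // add_to == true: accumulate.  The same channels become 2 + 2 = 4.
    tt::copy_tensor(true, dest, 1, src, 0, src.k());
}
```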
diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index dff836d50..be147a0a0 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2604,7 +2604,7 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
         }
@@ -2612,7 +2612,7 @@ namespace dlib
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
         }
@@ -2635,13 +2635,13 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
         }
         template<typename SUBNET>
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
         }
     };
 }
diff --git a/dlib/dnn/tensor_tools.cpp b/dlib/dnn/tensor_tools.cpp
index 7fc016fd1..eea38e679 100644
--- a/dlib/dnn/tensor_tools.cpp
+++ b/dlib/dnn/tensor_tools.cpp
@@ -881,6 +881,7 @@ namespace dlib { namespace tt
    // ------------------------------------------------------------------------------------

     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -889,9 +890,9 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #else
-        cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #endif
     }
diff --git a/dlib/dnn/tensor_tools.h b/dlib/dnn/tensor_tools.h
index 47e88606d..8b70501e2 100644
--- a/dlib/dnn/tensor_tools.h
+++ b/dlib/dnn/tensor_tools.h
@@ -1544,6 +1544,7 @@ namespace dlib { namespace tt
    // ----------------------------------------------------------------------------------------

     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -1560,8 +1561,12 @@ namespace dlib { namespace tt
             - is_same_object(dest,src) == false
             - The memory areas of src and dest do not overlap.
         ensures
-            - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
-              Copies content of each sample from src in to corresponding place of sample at dest.
+            - if (add_to) then
+                - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k is in [0, count_k)
+                  i.e., adds the content of each sample of src to the corresponding location in each sample of dest.
+            - else
+                - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k is in [0, count_k)
+                  i.e., copies the content of each sample of src into the corresponding location in each sample of dest.
     !*/

// ----------------------------------------------------------------------------------------
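The layers.h change is the point of the patch: `concat` still overwrites (`add_to == false`) when assembling its output, but `split`, which runs during backpropagation, now passes `true` so each sub-layer's slice of the incoming gradient is added to its `get_gradient_input()` rather than replacing whatever gradient other consumers of that layer have already deposited there. A small sketch of the accumulation this protects, using plain tensors as stand-ins (names and values are illustrative, not from the patch):

```cpp
#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // Stand-in for a layer's get_gradient_input(): another consumer of the
    // layer's output has already written its gradient contribution here.
    resizable_tensor grad_input(1, 4, 2, 2);
    grad_input = 0.5f;

    // The channel slice of the concat layer's incoming gradient that gets
    // routed back to this particular sub-layer.
    resizable_tensor concat_slice(1, 4, 2, 2);
    concat_slice = 0.25f;

    // add_to == true keeps the 0.5f that is already there: every element of
    // grad_input is now 0.75f.  With false it would be clobbered to 0.25f,
    // silently dropping the other consumer's gradient.
    tt::copy_tensor(true, grad_input, 0, concat_slice, 0, concat_slice.k());
}
```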
diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index 61030bf7e..e0c88d657 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -1794,9 +1794,9 @@
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);

-        cpu::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cpu::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cpu::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cpu::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cpu::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cpu::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy of src3 into the remaining channels of dest

         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1845,9 +1845,9 @@
         rnd.fill_gaussian(src1);
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);
-        cuda::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cuda::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cuda::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cuda::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cuda::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cuda::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy of src3 into the remaining channels of dest

         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1910,9 +1910,9 @@
         auto& b3o = layer<btag3>(net).get_output();

         resizable_tensor dest(10, 14, 111, 222);
-        copy_tensor(dest, 0, b1o, 0, b1o.k());
-        copy_tensor(dest, b1o.k(), b2o, 0, b2o.k());
-        copy_tensor(dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
+        copy_tensor(false, dest, 0, b1o, 0, b1o.k());
+        copy_tensor(false, dest, b1o.k(), b2o, 0, b2o.k());
+        copy_tensor(false, dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());

         DLIB_TEST(dest.size() == out.size());
         int error = memcmp(dest.host(), out.host(), dest.size());
@@ -1932,9 +1932,9 @@
         resizable_tensor g2(10, 8, 111, 222);
         resizable_tensor g3(10, 1, 111, 222);

-        copy_tensor(g1, 0, gr, 0, g1.k());
-        copy_tensor(g2, 0, gr, g1.k(), g2.k());
-        copy_tensor(g3, 0, gr, g1.k() + g2.k(), g3.k());
+        copy_tensor(false, g1, 0, gr, 0, g1.k());
+        copy_tensor(false, g2, 0, gr, g1.k(), g2.k());
+        copy_tensor(false, g3, 0, gr, g1.k() + g2.k(), g3.k());

         DLIB_TEST(g1.size() == b1g.size());
         error = memcmp(g1.host(), b1g.host(), b1g.size());
         DLIB_TEST(error == 0);
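Note that the updated tests exercise only the `add_to == false` path, so the new accumulation behavior itself is not covered here. A sketch of what such a check could look like, in the spirit of the existing tests (the sizes are illustrative, and `DLIB_CASSERT` stands in for the test harness's `DLIB_TEST`):

```cpp
#include <dlib/dnn.h>
#include <cmath>

int main()
{
    using namespace dlib;

    resizable_tensor dest(4, 5, 7, 7), src(4, 5, 7, 7);
    tt::tensor_rand rnd;
    rnd.fill_gaussian(dest);
    rnd.fill_gaussian(src);

    resizable_tensor expected(dest);  // snapshot of dest before accumulating
    tt::add(1, expected, 1, src);     // expected = 1*expected + 1*src

    // Accumulate all of src's channels into dest at channel offset 0.
    tt::copy_tensor(true, dest, 0, src, 0, src.k());

    for (size_t i = 0; i < dest.size(); ++i)
        DLIB_CASSERT(std::abs(dest.host()[i] - expected.host()[i]) < 1e-5);
}
```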