diff --git a/dlib/dnn/cpu_dlib.cpp b/dlib/dnn/cpu_dlib.cpp
index 6ba2ac4da..3735d7d48 100644
--- a/dlib/dnn/cpu_dlib.cpp
+++ b/dlib/dnn/cpu_dlib.cpp
@@ -2023,6 +2023,7 @@ namespace dlib
     // ------------------------------------------------------------------------------------

         void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -2045,7 +2046,15 @@ namespace dlib

             for (long i = 0; i < src.num_samples(); ++i)
             {
-                ::memcpy(dest_p, src_p, block_size * sizeof(float));
+                if (add_to)
+                {
+                    for (size_t j = 0; j < block_size; ++j)
+                        dest_p[j] += src_p[j];
+                }
+                else
+                {
+                    ::memcpy(dest_p, src_p, block_size * sizeof(float));
+                }

                 dest_p += dest_sample_size;
                 src_p += src_sample_size;
diff --git a/dlib/dnn/cpu_dlib.h b/dlib/dnn/cpu_dlib.h
index df8d20002..c117d15ab 100644
--- a/dlib/dnn/cpu_dlib.h
+++ b/dlib/dnn/cpu_dlib.h
@@ -445,6 +445,7 @@ namespace dlib
     // -----------------------------------------------------------------------------------

         void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
diff --git a/dlib/dnn/cuda_dlib.cu b/dlib/dnn/cuda_dlib.cu
index 2203e73a1..55b2cba5e 100644
--- a/dlib/dnn/cuda_dlib.cu
+++ b/dlib/dnn/cuda_dlib.cu
@@ -1377,12 +1377,33 @@ namespace dlib
     // ----------------------------------------------------------------------------------------

+        __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+        {
+            for(auto i : grid_stride_range(0, size))
+            {
+                size_t blk = i/block_size;
+                size_t j = i%block_size;
+                dest[blk*dest_stride + j] += src[blk*src_stride + j];
+            }
+        }
+
+        __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+        {
+            for(auto i : grid_stride_range(0, size))
+            {
+                size_t blk = i/block_size;
+                size_t j = i%block_size;
+                dest[blk*dest_stride + j] = src[blk*src_stride + j];
+            }
+        }
+
         void copy_tensor(
-            tensor& dest,
-            size_t dest_k_offset,
-            const tensor& src,
-            size_t src_k_offset,
-            size_t count_k
+            bool add_to,
+            tensor& dest,
+            size_t dest_k_offset,
+            const tensor& src,
+            size_t src_k_offset,
+            size_t count_k
         )
         {
             const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
@@ -1398,13 +1419,17 @@ namespace dlib
             float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
             const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();;
-
-            for (long i = 0; i < src.num_samples(); ++i)
+            if (add_to)
             {
-                CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
-
-                dest_p += dest_sample_size;
-                src_p += src_sample_size;
+                launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
+                              dest_p, block_size*dest.num_samples(),
+                              src_p, dest_sample_size, src_sample_size, block_size);
+            }
+            else
+            {
+                launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
+                              dest_p, block_size*dest.num_samples(),
+                              src_p, dest_sample_size, src_sample_size, block_size);
             }
         }
diff --git a/dlib/dnn/cuda_dlib.h b/dlib/dnn/cuda_dlib.h
index dbf1bbd69..28bed27b0 100644
--- a/dlib/dnn/cuda_dlib.h
+++ b/dlib/dnn/cuda_dlib.h
@@ -369,6 +369,7 @@ namespace dlib
     // ----------------------------------------------------------------------------------------

         void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
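Both backends now take the new `add_to` flag as their first parameter: `false` keeps the old overwrite semantics, while `true` accumulates the copied channel block into `dest`. On the CUDA side this also replaces the per-sample `cudaMemcpyAsync` loop with a single strided kernel launch covering all samples. Below is a minimal usage sketch of the two modes through the public `tt::copy_tensor` dispatcher; the tensor shapes and fill values are illustrative and not part of the patch:

```cpp
#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // dest and src share num_samples, nr, and nc; dest has more channels,
    // so src fits into a channel window of dest.
    resizable_tensor dest(2, 6, 3, 3);  // samples=2, k=6, nr=3, nc=3
    resizable_tensor src (2, 4, 3, 3);  // samples=2, k=4, nr=3, nc=3
    dest = 1;  // set every element of dest to 1
    src  = 2;  // set every element of src to 2

    // add_to == false: overwrite.  Channels [1, 5) of each dest sample become 2.
    tt::copy_tensor(false, dest, 1, src, 0, src.k());

    // add_to == true: accumulate.  The same channels become 2 + 2 = 4.
    tt::copy_tensor(true, dest, 1, src, 0, src.k());
}
```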
diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index dff836d50..be147a0a0 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2604,7 +2604,7 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
         }
@@ -2612,7 +2612,7 @@ namespace dlib
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
         }
@@ -2635,13 +2635,13 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
         }
         template<typename SUBNET>
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
         }
     };
 }
diff --git a/dlib/dnn/tensor_tools.cpp b/dlib/dnn/tensor_tools.cpp
index 7fc016fd1..eea38e679 100644
--- a/dlib/dnn/tensor_tools.cpp
+++ b/dlib/dnn/tensor_tools.cpp
@@ -881,6 +881,7 @@ namespace dlib { namespace tt
    // ------------------------------------------------------------------------------------

     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -889,9 +890,9 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #else
-        cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #endif
     }
diff --git a/dlib/dnn/tensor_tools.h b/dlib/dnn/tensor_tools.h
index 47e88606d..8b70501e2 100644
--- a/dlib/dnn/tensor_tools.h
+++ b/dlib/dnn/tensor_tools.h
@@ -1544,6 +1544,7 @@ namespace dlib { namespace tt
    // ----------------------------------------------------------------------------------------

     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -1560,8 +1561,12 @@ namespace dlib { namespace tt
             - is_same_object(dest,src) == false
             - The memory areas of src and dest do not overlap.
         ensures
-            - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
-              Copies content of each sample from src in to corresponding place of sample at dest.
+            - if (add_to) then
+                - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k is in [0, count_k)
+                  i.e., adds the content of each sample of src to the corresponding location in each sample of dest.
+            - else
+                - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k is in [0, count_k)
+                  i.e., copies the content of each sample of src into the corresponding location in each sample of dest.
     !*/

// ----------------------------------------------------------------------------------------
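The layers.h change is the point of the patch: `concat` still overwrites (`add_to == false`) when assembling its output, but `split`, which runs during backpropagation, now passes `true` so each sub-layer's slice of the incoming gradient is added to its `get_gradient_input()` rather than replacing whatever gradient other consumers of that layer have already deposited there. A small sketch of the accumulation this protects, using plain tensors as stand-ins (names and values are illustrative, not from the patch):

```cpp
#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // Stand-in for a layer's get_gradient_input(): another consumer of the
    // layer's output has already written its gradient contribution here.
    resizable_tensor grad_input(1, 4, 2, 2);
    grad_input = 0.5f;

    // The channel slice of the concat layer's incoming gradient that gets
    // routed back to this particular sub-layer.
    resizable_tensor concat_slice(1, 4, 2, 2);
    concat_slice = 0.25f;

    // add_to == true keeps the 0.5f that is already there: every element of
    // grad_input is now 0.75f.  With false it would be clobbered to 0.25f,
    // silently dropping the other consumer's gradient.
    tt::copy_tensor(true, grad_input, 0, concat_slice, 0, concat_slice.k());
}
```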
diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index 61030bf7e..e0c88d657 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -1794,9 +1794,9 @@
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);

-        cpu::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cpu::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cpu::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cpu::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cpu::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cpu::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy of src3 into the remaining channels of dest

         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1845,9 +1845,9 @@
         rnd.fill_gaussian(src1);
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);
-        cuda::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cuda::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cuda::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cuda::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cuda::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cuda::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy of src3 into the remaining channels of dest

         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1910,9 +1910,9 @@
         auto& b3o = layer<btag3>(net).get_output();

         resizable_tensor dest(10, 14, 111, 222);
-        copy_tensor(dest, 0, b1o, 0, b1o.k());
-        copy_tensor(dest, b1o.k(), b2o, 0, b2o.k());
-        copy_tensor(dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
+        copy_tensor(false, dest, 0, b1o, 0, b1o.k());
+        copy_tensor(false, dest, b1o.k(), b2o, 0, b2o.k());
+        copy_tensor(false, dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());

         DLIB_TEST(dest.size() == out.size());
         int error = memcmp(dest.host(), out.host(), dest.size());
@@ -1932,9 +1932,9 @@
         resizable_tensor g2(10, 8, 111, 222);
         resizable_tensor g3(10, 1, 111, 222);

-        copy_tensor(g1, 0, gr, 0, g1.k());
-        copy_tensor(g2, 0, gr, g1.k(), g2.k());
-        copy_tensor(g3, 0, gr, g1.k() + g2.k(), g3.k());
+        copy_tensor(false, g1, 0, gr, 0, g1.k());
+        copy_tensor(false, g2, 0, gr, g1.k(), g2.k());
+        copy_tensor(false, g3, 0, gr, g1.k() + g2.k(), g3.k());

         DLIB_TEST(g1.size() == b1g.size());
         error = memcmp(g1.host(), b1g.host(), b1g.size());
         DLIB_TEST(error == 0);
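Note that the updated tests exercise only the `add_to == false` path, so the new accumulation behavior itself is not covered here. A sketch of what such a check could look like, in the spirit of the existing tests (the sizes are illustrative, and `DLIB_CASSERT` stands in for the test harness's `DLIB_TEST`):

```cpp
#include <dlib/dnn.h>
#include <cmath>

int main()
{
    using namespace dlib;

    resizable_tensor dest(4, 5, 7, 7), src(4, 5, 7, 7);
    tt::tensor_rand rnd;
    rnd.fill_gaussian(dest);
    rnd.fill_gaussian(src);

    resizable_tensor expected(dest);  // snapshot of dest before accumulating
    tt::add(1, expected, 1, src);     // expected = 1*expected + 1*src

    // Accumulate all of src's channels into dest at channel offset 0.
    tt::copy_tensor(true, dest, 0, src, 0, src.k());

    for (size_t i = 0; i < dest.size(); ++i)
        DLIB_CASSERT(std::abs(dest.host()[i] - expected.host()[i]) < 1e-5);
}
```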