mirror of https://github.com/davisking/dlib.git
Added an "add_to" option to tt:copy_tensor(). There was also a bug in the
concat layer's backward() method. It was assigning the gradient to previous layers instead of adding the gradient, as required by the layer interface specification. This change also noticeably speeds up concat layers since only one CUDA kernel launch now happens per concat operation, rather than one kernel launch for each sample in a tensor.
This commit is contained in:
parent
89c9267e46
commit
7078cfaff5
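For illustration, here is a minimal sketch of how the new overload is meant to be called, mirroring what the concat layer changes below do (forward copies with add_to == false, backward accumulates with add_to == true). It assumes a dlib build that contains this commit; the tensor shapes are arbitrary.

    #include <dlib/dnn.h>
    #include <algorithm>
    #include <iostream>

    int main()
    {
        using namespace dlib;

        // Two branches with 3 and 5 channels each, concatenated along k.
        resizable_tensor a(2, 3, 4, 4), b(2, 5, 4, 4);
        resizable_tensor out(2, a.k() + b.k(), 4, 4);
        std::fill(a.host(), a.host() + a.size(), 1.0f);
        std::fill(b.host(), b.host() + b.size(), 2.0f);

        // Forward pass: plain copies into disjoint channel ranges, so add_to == false.
        tt::copy_tensor(false, out, 0,     a, 0, a.k());
        tt::copy_tensor(false, out, a.k(), b, 0, b.k());

        // Backward pass: gradients must be added into each branch's gradient input,
        // which may already hold contributions from other layers, so add_to == true.
        resizable_tensor grad_a(2, a.k(), 4, 4);
        std::fill(grad_a.host(), grad_a.host() + grad_a.size(), 0.0f);
        tt::copy_tensor(true, grad_a, 0, out, 0, grad_a.k());

        std::cout << grad_a.host()[0] << std::endl;  // prints 1 (0 + the value copied from 'a')
        return 0;
    }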
@@ -2023,6 +2023,7 @@ namespace dlib
 // ------------------------------------------------------------------------------------
 
     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -2045,7 +2046,15 @@ namespace dlib
 
         for (long i = 0; i < src.num_samples(); ++i)
         {
-            ::memcpy(dest_p, src_p, block_size * sizeof(float));
+            if (add_to)
+            {
+                for (size_t j = 0; j < block_size; ++j)
+                    dest_p[j] += src_p[j];
+            }
+            else
+            {
+                ::memcpy(dest_p, src_p, block_size * sizeof(float));
+            }
 
             dest_p += dest_sample_size;
             src_p += src_sample_size;
@@ -445,6 +445,7 @@ namespace dlib
 // -----------------------------------------------------------------------------------
 
     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -1377,12 +1377,33 @@ namespace dlib
 
 // ----------------------------------------------------------------------------------------
 
+        __global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+        {
+            for(auto i : grid_stride_range(0, size))
+            {
+                size_t blk = i/block_size;
+                size_t j = i%block_size;
+                dest[blk*dest_stride + j] += src[blk*src_stride + j];
+            }
+        }
+
+        __global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
+        {
+            for(auto i : grid_stride_range(0, size))
+            {
+                size_t blk = i/block_size;
+                size_t j = i%block_size;
+                dest[blk*dest_stride + j] = src[blk*src_stride + j];
+            }
+        }
+
         void copy_tensor(
-            tensor& dest,
-            size_t dest_k_offset,
-            const tensor& src,
-            size_t src_k_offset,
-            size_t count_k
+            bool add_to,
+            tensor& dest,
+            size_t dest_k_offset,
+            const tensor& src,
+            size_t src_k_offset,
+            size_t count_k
         )
         {
             const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
@@ -1398,13 +1419,17 @@ namespace dlib
             float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
             const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();;
 
-            for (long i = 0; i < src.num_samples(); ++i)
-            {
-                CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
-
-                dest_p += dest_sample_size;
-                src_p += src_sample_size;
-            }
+            if (add_to)
+            {
+                launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
+                              dest_p, block_size*dest.num_samples(),
+                              src_p, dest_sample_size, src_sample_size, block_size);
+            }
+            else
+            {
+                launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
+                              dest_p, block_size*dest.num_samples(),
+                              src_p, dest_sample_size, src_sample_size, block_size);
+            }
         }
 
 // ----------------------------------------------------------------------------------------
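To illustrate the speedup claim in the commit message, here is a small standalone CUDA sketch (not dlib's code; all names and sizes are made up) of the same idea: a single grid-stride kernel walks every sample's block via flat-index arithmetic, so one launch replaces the old per-sample cudaMemcpyAsync loop.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    __global__ void strided_block_copy(float* dest, size_t n, const float* src,
                                       size_t dest_stride, size_t src_stride,
                                       size_t block_size)
    {
        // n = num_samples * block_size flattened; each thread handles several i's.
        for (size_t i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x)
        {
            size_t blk = i/block_size;   // which sample
            size_t j   = i%block_size;   // offset inside that sample's block
            dest[blk*dest_stride + j] = src[blk*src_stride + j];
        }
    }

    int main()
    {
        const size_t num_samples = 4, src_stride = 8, dest_stride = 16, block_size = 8;
        std::vector<float> h_src(num_samples*src_stride, 1.0f);
        std::vector<float> h_dst(num_samples*dest_stride, 0.0f);

        float *d_src, *d_dst;
        cudaMalloc(&d_src, h_src.size()*sizeof(float));
        cudaMalloc(&d_dst, h_dst.size()*sizeof(float));
        cudaMemcpy(d_src, h_src.data(), h_src.size()*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_dst, h_dst.data(), h_dst.size()*sizeof(float), cudaMemcpyHostToDevice);

        // A single launch covers every sample in the tensor.
        strided_block_copy<<<16, 256>>>(d_dst, num_samples*block_size, d_src,
                                        dest_stride, src_stride, block_size);

        cudaMemcpy(h_dst.data(), d_dst, h_dst.size()*sizeof(float), cudaMemcpyDeviceToHost);
        std::printf("%f\n", h_dst[0]);   // 1.0
        cudaFree(d_src); cudaFree(d_dst);
        return 0;
    }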
@@ -369,6 +369,7 @@ namespace dlib
 // ----------------------------------------------------------------------------------------
 
         void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -2604,7 +2604,7 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
         }
@@ -2612,7 +2612,7 @@ namespace dlib
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
             k_offset += t.k();
             concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
         }
@@ -2635,13 +2635,13 @@ namespace dlib
         static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_output();
-            tt::copy_tensor(out, k_offset, t, 0, t.k());
+            tt::copy_tensor(false, out, k_offset, t, 0, t.k());
         }
         template<typename SUBNET>
         static void split(const tensor& input, SUBNET& sub, size_t k_offset)
         {
             auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
-            tt::copy_tensor(t, 0, input, k_offset, t.k());
+            tt::copy_tensor(true, t, 0, input, k_offset, t.k());
         }
     };
 }
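The change to split() above is the actual bug fix: a branch's get_gradient_input() can already hold gradient contributions from other paths, and the layer interface requires backward() to add to it rather than assign. A tiny plain-C++ sketch (not dlib's API) of the difference:

    #include <iostream>
    #include <vector>

    int main()
    {
        // grad_input stands in for a layer's gradient input buffer.
        std::vector<float> grad_input(4, 0.0f);
        std::vector<float> from_concat{1, 1, 1, 1};  // gradient routed back by concat
        std::vector<float> from_other {2, 2, 2, 2};  // gradient from some other consumer

        for (size_t i = 0; i < grad_input.size(); ++i)
            grad_input[i] += from_other[i];          // another path already contributed

        // Accumulating (what copy_tensor(true, ...) now does) keeps that contribution:
        for (size_t i = 0; i < grad_input.size(); ++i)
            grad_input[i] += from_concat[i];

        std::cout << grad_input[0] << std::endl;     // 3; plain assignment would have printed 1
        return 0;
    }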
@@ -881,6 +881,7 @@ namespace dlib { namespace tt
 // ------------------------------------------------------------------------------------
 
     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -889,9 +890,9 @@ namespace dlib { namespace tt
     )
     {
 #ifdef DLIB_USE_CUDA
-        cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #else
-        cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
+        cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
 #endif
     }
 
@@ -1544,6 +1544,7 @@ namespace dlib { namespace tt
 // ----------------------------------------------------------------------------------------
 
     void copy_tensor(
+            bool add_to,
             tensor& dest,
             size_t dest_k_offset,
             const tensor& src,
@@ -1560,8 +1561,12 @@ namespace dlib { namespace tt
             - is_same_object(dest,src) == false
             - The memory areas of src and dest do not overlap.
         ensures
-            - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
-              Copies content of each sample from src in to corresponding place of sample at dest.
+            - if (add_to) then
+                - performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., adds the content of each sample from src into the corresponding place in dest.
+            - else
+                - performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
+                  i.e., copies the content of each sample from src into the corresponding place in dest.
     !*/
 
 // ----------------------------------------------------------------------------------------
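As a reading aid for the ensures clause above, here is a reference sketch of the same semantics over a plain row-major (num_samples, k, nr, nc) buffer; this is illustrative code, not dlib's implementation, and the function and parameter names are invented.

    #include <cstddef>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Reference semantics of copy_tensor for a row-major (num_samples, k, nr, nc) buffer.
    void copy_tensor_ref(bool add_to,
                         float* dest, std::size_t dest_k, std::size_t dest_k_offset,
                         const float* src, std::size_t src_k, std::size_t src_k_offset,
                         std::size_t num_samples, std::size_t nr, std::size_t nc,
                         std::size_t count_k)
    {
        const std::size_t plane       = nr*nc;          // one channel of one sample
        const std::size_t block_size  = count_k*plane;  // channels moved per sample
        const std::size_t dest_stride = dest_k*plane;   // full sample size in dest
        const std::size_t src_stride  = src_k*plane;    // full sample size in src

        float* d       = dest + dest_k_offset*plane;
        const float* s = src  + src_k_offset*plane;

        for (std::size_t i = 0; i < num_samples; ++i)
        {
            if (add_to)
            {
                for (std::size_t j = 0; j < block_size; ++j)
                    d[j] += s[j];                        // dest[i, k+off, r, c] += src[i, k+off, r, c]
            }
            else
            {
                std::memcpy(d, s, block_size*sizeof(float));
            }
            d += dest_stride;
            s += src_stride;
        }
    }

    int main()
    {
        // dest: 2 samples x 4 channels, src: 2 samples x 2 channels, 1x1 spatial.
        std::vector<float> dest(2*4, 0.0f), src(2*2, 5.0f);
        copy_tensor_ref(true, dest.data(), 4, 1, src.data(), 2, 0, 2, 1, 1, 2);
        std::cout << dest[1] << " " << dest[5] << std::endl;  // 5 5: channels 1..2 of each sample
        return 0;
    }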
@@ -1794,9 +1794,9 @@ namespace
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);
 
-        cpu::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cpu::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cpu::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cpu::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cpu::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cpu::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
 
 
         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1845,9 +1845,9 @@ namespace
         rnd.fill_gaussian(src1);
         rnd.fill_gaussian(src2);
         rnd.fill_gaussian(src3);
-        cuda::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
-        cuda::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
-        cuda::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
+        cuda::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
+        cuda::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
+        cuda::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
 
 
         for (long i = 0; i < dest.num_samples(); ++i)
@@ -1910,9 +1910,9 @@ namespace
         auto& b3o = layer<itag3>(net).get_output();
 
         resizable_tensor dest(10, 14, 111, 222);
-        copy_tensor(dest, 0, b1o, 0, b1o.k());
-        copy_tensor(dest, b1o.k(), b2o, 0, b2o.k());
-        copy_tensor(dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
+        copy_tensor(false, dest, 0, b1o, 0, b1o.k());
+        copy_tensor(false, dest, b1o.k(), b2o, 0, b2o.k());
+        copy_tensor(false, dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
 
         DLIB_TEST(dest.size() == out.size());
         int error = memcmp(dest.host(), out.host(), dest.size());
@@ -1932,9 +1932,9 @@ namespace
         resizable_tensor g2(10, 8, 111, 222);
         resizable_tensor g3(10, 1, 111, 222);
 
-        copy_tensor(g1, 0, gr, 0, g1.k());
-        copy_tensor(g2, 0, gr, g1.k(), g2.k());
-        copy_tensor(g3, 0, gr, g1.k() + g2.k(), g3.k());
+        copy_tensor(false, g1, 0, gr, 0, g1.k());
+        copy_tensor(false, g2, 0, gr, g1.k(), g2.k());
+        copy_tensor(false, g3, 0, gr, g1.k() + g2.k(), g3.k());
         DLIB_TEST(g1.size() == b1g.size());
         error = memcmp(g1.host(), b1g.host(), b1g.size());
         DLIB_TEST(error == 0);