Added an "add_to" option to tt::copy_tensor(). There was also a bug in the
concat layer's backward() method. It was assigning the gradient to previous
layers instead of adding the gradient, as required by the layer interface
specification. This change also noticeably speeds up concat layers since only
one CUDA kernel launch now happens per concat operation, rather than one
kernel launch for each sample in a tensor.
Davis King 2017-08-14 12:28:26 -04:00
parent 89c9267e46
commit 7078cfaff5
8 changed files with 74 additions and 32 deletions
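
To make the new interface concrete, here is a minimal sketch of the call pattern this commit establishes (the tensor shapes below are made up for illustration and are not taken from the diff): concat's forward pass overwrites disjoint channel ranges of the output with add_to == false, while the backward pass accumulates each gradient slice into the corresponding branch with add_to == true, as the layer interface requires.

#include <dlib/dnn.h>

void copy_tensor_usage_sketch()
{
    using namespace dlib;
    resizable_tensor out(1, 8, 4, 4);   // concatenated output, 8 channels
    resizable_tensor a(1, 5, 4, 4);     // first branch, 5 channels
    resizable_tensor b(1, 3, 4, 4);     // second branch, 3 channels

    // forward(): plain copies into disjoint k ranges of out
    tt::copy_tensor(false, out, 0,     a, 0, a.k());
    tt::copy_tensor(false, out, a.k(), b, 0, b.k());

    // backward(): split the incoming gradient (here we just reuse out as a
    // stand-in for it) and *add* each slice into the branch's gradient input,
    // since other layers may already have contributed to that tensor.
    resizable_tensor grad_a(1, 5, 4, 4), grad_b(1, 3, 4, 4);
    tt::copy_tensor(true, grad_a, 0, out, 0,     grad_a.k());
    tt::copy_tensor(true, grad_b, 0, out, a.k(), grad_b.k());
}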

View File

@@ -2023,6 +2023,7 @@ namespace dlib
// ------------------------------------------------------------------------------------
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
@@ -2045,7 +2046,15 @@ namespace dlib
for (long i = 0; i < src.num_samples(); ++i)
{
::memcpy(dest_p, src_p, block_size * sizeof(float));
if (add_to)
{
for (size_t j = 0; j < block_size; ++j)
dest_p[j] += src_p[j];
}
else
{
::memcpy(dest_p, src_p, block_size * sizeof(float));
}
dest_p += dest_sample_size;
src_p += src_sample_size;
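
The CPU path above handles one contiguous block per sample and advances each pointer by its own per-sample stride, since dest and src can have different channel counts. For reference, a self-contained sketch of that loop with raw float buffers (the function name and setup are illustrative, not dlib code):

#include <cstring>
#include <cstddef>

// Copy or accumulate block_size floats per sample, stepping each side by its
// own per-sample stride.
void copy_blocks(bool add_to, float* dest_p, std::size_t dest_sample_size,
                 const float* src_p, std::size_t src_sample_size,
                 std::size_t block_size, long num_samples)
{
    for (long i = 0; i < num_samples; ++i)
    {
        if (add_to)
        {
            for (std::size_t j = 0; j < block_size; ++j)
                dest_p[j] += src_p[j];
        }
        else
        {
            std::memcpy(dest_p, src_p, block_size * sizeof(float));
        }
        dest_p += dest_sample_size;
        src_p  += src_sample_size;
    }
}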

View File

@@ -445,6 +445,7 @@ namespace dlib
// -----------------------------------------------------------------------------------
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,

View File

@@ -1377,12 +1377,33 @@ namespace dlib
// ----------------------------------------------------------------------------------------
__global__ void _cuda_copy_tensor_add_to (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
{
for(auto i : grid_stride_range(0, size))
{
size_t blk = i/block_size;
size_t j = i%block_size;
dest[blk*dest_stride + j] += src[blk*src_stride + j];
}
}
__global__ void _cuda_copy_tensor (float* dest, size_t size, const float* src, size_t dest_stride, size_t src_stride, size_t block_size)
{
for(auto i : grid_stride_range(0, size))
{
size_t blk = i/block_size;
size_t j = i%block_size;
dest[blk*dest_stride + j] = src[blk*src_stride + j];
}
}
void copy_tensor(
tensor& dest,
size_t dest_k_offset,
const tensor& src,
size_t src_k_offset,
size_t count_k
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
size_t src_k_offset,
size_t count_k
)
{
const size_t dest_sample_size = static_cast<size_t>(dest.nc() * dest.nr() * dest.k());
@@ -1398,13 +1419,17 @@ namespace dlib
float* dest_p = dest.device() + dest_k_offset * dest.nc() * dest.nr();
const float* src_p = src.device() + src_k_offset * src.nc() * src.nr();;
for (long i = 0; i < src.num_samples(); ++i)
if (add_to)
{
CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
dest_p += dest_sample_size;
src_p += src_sample_size;
launch_kernel(_cuda_copy_tensor_add_to, max_jobs(dest.size()),
dest_p, block_size*dest.num_samples(),
src_p, dest_sample_size, src_sample_size, block_size);
}
else
{
launch_kernel(_cuda_copy_tensor, max_jobs(dest.size()),
dest_p, block_size*dest.num_samples(),
src_p, dest_sample_size, src_sample_size, block_size);
}
}
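
This is where the speedup mentioned in the commit message comes from: the old code issued one cudaMemcpyAsync per sample, while the new code covers all samples with a single kernel launch that flattens the copy into block_size * num_samples indices and recovers (sample, offset) with a divide and modulo. A rough standalone CUDA equivalent of the add_to kernel, written with a hand-rolled grid-stride loop instead of dlib's launch_kernel/grid_stride_range helpers (names and launch geometry are illustrative):

__global__ void copy_tensor_add_to_sketch(float* dest, size_t size, const float* src,
                                          size_t dest_stride, size_t src_stride,
                                          size_t block_size)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
         i += (size_t)gridDim.x * blockDim.x)
    {
        size_t blk = i / block_size;   // which sample
        size_t j   = i % block_size;   // offset inside that sample's block
        dest[blk * dest_stride + j] += src[blk * src_stride + j];
    }
}

// A plausible launch (grid/block sizes chosen arbitrarily here):
//   size_t n = block_size * num_samples;
//   copy_tensor_add_to_sketch<<<512, 256>>>(dest_p, n, src_p,
//                                           dest_sample_size, src_sample_size, block_size);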

View File

@@ -369,6 +369,7 @@ namespace dlib
// ----------------------------------------------------------------------------------------
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,

View File

@@ -2604,7 +2604,7 @@ namespace dlib
static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
{
auto& t = layer<TAG_TYPE>(sub).get_output();
tt::copy_tensor(out, k_offset, t, 0, t.k());
tt::copy_tensor(false, out, k_offset, t, 0, t.k());
k_offset += t.k();
concat_helper_impl<TAG_TYPES...>::concat(out, sub, k_offset);
}
@@ -2612,7 +2612,7 @@ namespace dlib
static void split(const tensor& input, SUBNET& sub, size_t k_offset)
{
auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
tt::copy_tensor(t, 0, input, k_offset, t.k());
tt::copy_tensor(true, t, 0, input, k_offset, t.k());
k_offset += t.k();
concat_helper_impl<TAG_TYPES...>::split(input, sub, k_offset);
}
@@ -2635,13 +2635,13 @@ namespace dlib
static void concat(tensor& out, const SUBNET& sub, size_t k_offset)
{
auto& t = layer<TAG_TYPE>(sub).get_output();
tt::copy_tensor(out, k_offset, t, 0, t.k());
tt::copy_tensor(false, out, k_offset, t, 0, t.k());
}
template<typename SUBNET>
static void split(const tensor& input, SUBNET& sub, size_t k_offset)
{
auto& t = layer<TAG_TYPE>(sub).get_gradient_input();
tt::copy_tensor(t, 0, input, k_offset, t.k());
tt::copy_tensor(true, t, 0, input, k_offset, t.k());
}
};
}
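
These two split() changes are the actual bug fix from the commit message: split() writes into get_gradient_input() of each tagged layer, and dlib's layer interface requires backward passes to add into that tensor, because other consumers of the same tagged layer may already have deposited their gradients there; assigning (add_to == false) silently discarded those contributions. A tiny sketch of the difference (shapes and values are made up):

#include <dlib/dnn.h>

void accumulation_sketch()
{
    using namespace dlib;
    resizable_tensor grad_in(1, 2, 1, 1);    // shared gradient input of a tagged layer
    grad_in = 1;                             // contribution already deposited by another consumer

    resizable_tensor from_concat(1, 2, 1, 1);
    from_concat = 5;                         // gradient slice produced by the concat layer

    // Old behaviour (assignment) would leave every element of grad_in at 5,
    // losing the earlier 1s; with add_to == true each element ends up at 6.
    tt::copy_tensor(true, grad_in, 0, from_concat, 0, from_concat.k());
}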

View File

@@ -881,6 +881,7 @@ namespace dlib { namespace tt
// ------------------------------------------------------------------------------------
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
@@ -889,9 +890,9 @@ namespace dlib { namespace tt
)
{
#ifdef DLIB_USE_CUDA
cuda::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
cuda::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
#else
cpu::copy_tensor(dest, dest_k_offset, src, src_k_offset, count_k);
cpu::copy_tensor(add_to, dest, dest_k_offset, src, src_k_offset, count_k);
#endif
}

View File

@@ -1544,6 +1544,7 @@ namespace dlib { namespace tt
// ----------------------------------------------------------------------------------------
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
@@ -1560,8 +1561,12 @@ namespace dlib { namespace tt
- is_same_object(dest,src) == false
- The memory areas of src and dest do not overlap.
ensures
- performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
Copies content of each sample from src in to corresponding place of sample at dest.
- if (add_to) then
- performs: dest[i, k + dest_k_offset, r, c] += src[i, k + src_k_offset, r, c], where k in [0..count_k]
i.e., adds content of each sample from src in to corresponding place of sample at dest.
- else
- performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
i.e., copies content of each sample from src in to corresponding place of sample at dest.
!*/
// ----------------------------------------------------------------------------------------
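
For clarity, a naive reference implementation of this contract, indexing dest and src directly in dlib's row-major (sample, k, row, column) layout; it is only a sketch of the semantics, the real implementations above do the same work with memcpy and a CUDA kernel:

#include <dlib/dnn.h>

void copy_tensor_reference(bool add_to, dlib::tensor& dest, size_t dest_k_offset,
                           const dlib::tensor& src, size_t src_k_offset, size_t count_k)
{
    float* d = dest.host();
    const float* s = src.host();
    for (long i = 0; i < src.num_samples(); ++i)
        for (size_t k = 0; k < count_k; ++k)
            for (long r = 0; r < src.nr(); ++r)
                for (long c = 0; c < src.nc(); ++c)
                {
                    // row-major offset: ((sample*K + channel)*nr + row)*nc + col
                    const size_t di = ((i*dest.k() + k + dest_k_offset)*dest.nr() + r)*dest.nc() + c;
                    const size_t si = ((i*src.k()  + k + src_k_offset )*src.nr()  + r)*src.nc()  + c;
                    if (add_to) d[di] += s[si];
                    else        d[di]  = s[si];
                }
}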

View File

@@ -1794,9 +1794,9 @@ namespace
rnd.fill_gaussian(src2);
rnd.fill_gaussian(src3);
cpu::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
cpu::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
cpu::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
cpu::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
cpu::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
cpu::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
for (long i = 0; i < dest.num_samples(); ++i)
@@ -1845,9 +1845,9 @@ namespace
rnd.fill_gaussian(src1);
rnd.fill_gaussian(src2);
rnd.fill_gaussian(src3);
cuda::copy_tensor(dest, 0, src1, 0, src1.k()); //full copy src1->dest
cuda::copy_tensor(dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
cuda::copy_tensor(dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
cuda::copy_tensor(false, dest, 0, src1, 0, src1.k()); //full copy src1->dest
cuda::copy_tensor(false, dest, src1.k(), src2, 0, src2.k()); //full copy src2->dest with offset of src1
cuda::copy_tensor(false, dest, src1.k() + src2.k(), src3, 3, 3); //partial copy src3 into the rest place of dest
for (long i = 0; i < dest.num_samples(); ++i)
@@ -1910,9 +1910,9 @@ namespace
auto& b3o = layer<itag3>(net).get_output();
resizable_tensor dest(10, 14, 111, 222);
copy_tensor(dest, 0, b1o, 0, b1o.k());
copy_tensor(dest, b1o.k(), b2o, 0, b2o.k());
copy_tensor(dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
copy_tensor(false, dest, 0, b1o, 0, b1o.k());
copy_tensor(false, dest, b1o.k(), b2o, 0, b2o.k());
copy_tensor(false, dest, b1o.k() + b2o.k(), b3o, 0, b3o.k());
DLIB_TEST(dest.size() == out.size());
int error = memcmp(dest.host(), out.host(), dest.size());
@@ -1932,9 +1932,9 @@ namespace
resizable_tensor g2(10, 8, 111, 222);
resizable_tensor g3(10, 1, 111, 222);
copy_tensor(g1, 0, gr, 0, g1.k());
copy_tensor(g2, 0, gr, g1.k(), g2.k());
copy_tensor(g3, 0, gr, g1.k() + g2.k(), g3.k());
copy_tensor(false, g1, 0, gr, 0, g1.k());
copy_tensor(false, g2, 0, gr, g1.k(), g2.k());
copy_tensor(false, g3, 0, gr, g1.k() + g2.k(), g3.k());
DLIB_TEST(g1.size() == b1g.size());
error = memcmp(g1.host(), b1g.host(), b1g.size());
DLIB_TEST(error == 0);
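
The updated tests above only exercise add_to == false. A sketch of what an accumulation check in the same style might look like (illustrative only, not part of this commit):

dlib::tt::tensor_rand rnd;
resizable_tensor dest(10, 4, 111, 222), src(10, 4, 111, 222);
rnd.fill_gaussian(src);
dest = 0;
cpu::copy_tensor(true, dest, 0, src, 0, src.k());   // dest == src
cpu::copy_tensor(true, dest, 0, src, 0, src.k());   // dest == 2*src
DLIB_TEST(max(abs(mat(dest) - 2*mat(src))) < 1e-6);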