mirror of https://github.com/davisking/dlib.git
Added multiply_zero_padded()
This commit is contained in:
parent 46a02d9447
commit f7310f4bbc
@@ -265,6 +265,79 @@ namespace dlib
            }
        }

    // ----------------------------------------------------------------------------------------

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        )
        {
            auto d = dest.host();
            auto s1 = src1.host();
            auto s2 = src2.host();

            // Do the simple and fast version if everything has the same dimensions.
            if (have_same_dimensions(dest, src1) &&
                have_same_dimensions(dest, src2))
            {
                if (add_to)
                {
                    for (size_t i = 0; i < dest.size(); ++i)
                        d[i] += s1[i] * s2[i];
                }
                else
                {
                    for (size_t i = 0; i < dest.size(); ++i)
                        d[i] = s1[i] * s2[i];
                }
                return;
            }

            // Otherwise, do the more complex version with bounds checking.
            for (long n = 0; n < dest.num_samples(); ++n)
            {
                for (long k = 0; k < dest.k(); ++k)
                {
                    for (long r = 0; r < dest.nr(); ++r)
                    {
                        for (long c = 0; c < dest.nc(); ++c)
                        {
                            float v1 = 0;
                            float v2 = 0;

                            // if this index is inside src1
                            if (n < src1.num_samples() &&
                                k < src1.k() &&
                                r < src1.nr() &&
                                c < src1.nc() )
                            {
                                const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
                                v1 = s1[s_idx];
                            }

                            // if this index is inside src2
                            if (n < src2.num_samples() &&
                                k < src2.k() &&
                                r < src2.nr() &&
                                c < src2.nc() )
                            {
                                const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
                                v2 = s2[s_idx];
                            }

                            if (add_to)
                                *d += v1 * v2;
                            else
                                *d = v1 * v2;
                            ++d;
                        }
                    }
                }
            }
        }

    // ----------------------------------------------------------------------------------------

        void assign_bias_gradient (
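The bounds-checked path above walks dest in row-major (n, k, r, c) order and reads each source through the flat index ((n*K + k)*NR + r)*NC + c, substituting 0 for any coordinate that falls outside that source's dimensions. A minimal standalone sketch of that indexing rule follows; the shape and the helper name are illustrative, not part of dlib:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Row-major flat index for a 4D tensor of shape (N, K, NR, NC); this is
    // the same formula the loop above applies to src1 and src2.
    std::size_t flat_idx(long K, long NR, long NC, long n, long k, long r, long c)
    {
        return std::size_t(((n*K + k)*NR + r)*NC + c);
    }

    int main()
    {
        // A 2x3x4x5 tensor stored as one contiguous buffer, the way
        // tensor::host() exposes it.
        const long N = 2, K = 3, NR = 4, NC = 5;
        std::vector<float> buf(N*K*NR*NC, 0.0f);

        // The last 4D coordinate maps to the last buffer element.
        assert(flat_idx(K, NR, NC, N-1, K-1, NR-1, NC-1) == buf.size()-1);
        return 0;
    }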
@@ -30,6 +30,13 @@ namespace dlib
            const tensor& src2
        );

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );

        void add(
            float beta,
            tensor& dest,
@@ -513,6 +513,134 @@ namespace dlib

        }

    // ------------------------------------------------------------------------------------

        __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n)
        {
            for (auto i : grid_stride_range(0, n))
            {
                d[i] = s1[i]*s2[i];
            }
        }

        __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n)
        {
            for (auto i : grid_stride_range(0, n))
            {
                d[i] += s1[i]*s2[i];
            }
        }

        __global__ void _cuda_mult2(float* d, const float* s1, const float* s2,
                                    size_t dn, size_t dk, size_t dr, size_t dc,
                                    size_t s1n, size_t s1k, size_t s1r, size_t s1c,
                                    size_t s2n, size_t s2k, size_t s2r, size_t s2c)
        {
            for (auto i : grid_stride_range(0, dn*dk*dr*dc))
            {
                size_t n,k,r,c;
                unpack_idx(i, dk,dr,dc, n,k,r,c);

                float v1 = 0;
                float v2 = 0;

                if (n < s1n &&
                    k < s1k &&
                    r < s1r &&
                    c < s1c )
                {
                    v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
                }

                if (n < s2n &&
                    k < s2k &&
                    r < s2r &&
                    c < s2c )
                {
                    v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
                }

                d[i] = v1*v2;
            }
        }

        __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2,
                                           size_t dn, size_t dk, size_t dr, size_t dc,
                                           size_t s1n, size_t s1k, size_t s1r, size_t s1c,
                                           size_t s2n, size_t s2k, size_t s2r, size_t s2c)
        {
            for (auto i : grid_stride_range(0, dn*dk*dr*dc))
            {
                size_t n,k,r,c;
                unpack_idx(i, dk,dr,dc, n,k,r,c);

                float v1 = 0;
                float v2 = 0;

                if (n < s1n &&
                    k < s1k &&
                    r < s1r &&
                    c < s1c )
                {
                    v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
                }

                if (n < s2n &&
                    k < s2k &&
                    r < s2r &&
                    c < s2c )
                {
                    v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
                }

                d[i] += v1*v2;
            }
        }

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        )
        {
            if (dest.size() == 0)
                return;

            // Do the simple and fast version if everything has the same dimensions.
            if (have_same_dimensions(dest, src1) &&
                have_same_dimensions(dest, src2))
            {
                if (add_to)
                    launch_kernel(_cuda_mult1_add_to, max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
                else
                    launch_kernel(_cuda_mult1, max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
            }
            else
            {
                // Otherwise, do the more complex version with bounds checking.
                if (add_to)
                {
                    launch_kernel(_cuda_mult2_add_to, max_jobs(dest.size()),
                        dest.device(), src1.device(), src2.device(),
                        dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
                        src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
                        src2.num_samples(), src2.k(), src2.nr(), src2.nc()
                        );
                }
                else
                {
                    launch_kernel(_cuda_mult2, max_jobs(dest.size()),
                        dest.device(), src1.device(), src2.device(),
                        dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
                        src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
                        src2.num_samples(), src2.k(), src2.nr(), src2.nc()
                        );
                }
            }
        }

    // ------------------------------------------------------------------------------------

        __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
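_cuda_mult2 and _cuda_mult2_add_to give each thread a grid-stride range of dest elements, turn the flat index i into (n,k,r,c) coordinates with unpack_idx, and fold those coordinates back into each source's flat index with pack_idx. Those helpers come from dlib's CUDA utilities and are not part of this diff; the host-side sketch below shows the arithmetic they would have to perform to be consistent with how the kernels use them (the _sketch names are hypothetical stand-ins):

    #include <cstddef>

    // Split a row-major flat index over inner dimensions (dk, dr, dc) back
    // into 4D coordinates.  Mirrors what unpack_idx presumably does.
    inline void unpack_idx_sketch(std::size_t idx,
                                  std::size_t dk, std::size_t dr, std::size_t dc,
                                  std::size_t& n, std::size_t& k,
                                  std::size_t& r, std::size_t& c)
    {
        c = idx % dc;  idx /= dc;
        r = idx % dr;  idx /= dr;
        k = idx % dk;  idx /= dk;
        n = idx;
    }

    // The inverse: fold 4D coordinates into a row-major flat index, the same
    // formula the CPU version writes out inline.
    inline std::size_t pack_idx_sketch(std::size_t dk, std::size_t dr, std::size_t dc,
                                       std::size_t n, std::size_t k,
                                       std::size_t r, std::size_t c)
    {
        return ((n*dk + k)*dr + r)*dc + c;
    }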
@@ -185,6 +185,13 @@ namespace dlib
            const tensor& src2
        );

        void multiply_zero_padded (
            bool add_to,
            tensor& dest,
            const tensor& src1,
            const tensor& src2
        );

        void add (
            tensor& dest,
            const tensor& src1,
@@ -304,6 +304,20 @@ namespace dlib { namespace tt
#endif
    }

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    )
    {
#ifdef DLIB_USE_CUDA
        cuda::multiply_zero_padded(add_to, dest, src1, src2);
#else
        cpu::multiply_zero_padded(add_to, dest, src1, src2);
#endif
    }

// ----------------------------------------------------------------------------------------

    void affine_transform(
@@ -306,6 +306,23 @@ namespace dlib { namespace tt
                - Instead of assigning the result to dest, this function adds the result to dest.
    !*/

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    );
    /*!
        ensures
            - if (add_to) then
                - performs: dest += src1 * src2
            - else
                - performs: dest = src1 * src2
            - In either case, the multiplication happens pointwise according to 4D tensor
              arithmetic.  If the dimensions don't match then missing elements are presumed
              to be equal to 0.
    !*/

// ----------------------------------------------------------------------------------------

    void affine_transform(
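To make the documented contract concrete, here is a small usage sketch: dest's own dimensions drive the computation, and any coordinate that falls outside a source contributes a 0 factor. The tensor shapes below are arbitrary examples, not taken from the commit:

    #include <dlib/dnn.h>
    using namespace dlib;

    int main()
    {
        resizable_tensor dest(2,3,4,4);  // output shape drives the iteration
        resizable_tensor src1(2,3,4,4);  // same shape as dest
        resizable_tensor src2(1,3,2,2);  // smaller: elements outside it read as 0
        src1 = 1;
        src2 = 2;

        // dest = src1 .* src2, with src2 zero-padded up to dest's shape.
        tt::multiply_zero_padded(false, dest, src1, src2);
        // Only the 1x3x2x2 corner of dest equals 2; every other element is 0.
        return 0;
    }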
@@ -904,6 +904,64 @@ namespace
        DLIB_TEST_MSG(max(abs(mat(v)-mat(vv))) < 1e-6, max(abs(mat(v)-mat(vv))));
    }

    void test_multiply_zero_padded()
    {
        print_spinner();
        dlib::rand rnd;
        tt::tensor_rand trnd;
        for (int iter = 0; iter < 300; ++iter)
        {
            resizable_tensor dest1(rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1);
            resizable_tensor dest2;
            dest2.copy_size(dest1);
            resizable_tensor src1(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);
            resizable_tensor src2(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);

            trnd.fill_uniform(dest1);
            trnd.fill_uniform(dest2);
            trnd.fill_uniform(src1);
            trnd.fill_uniform(src2);
            cpu::multiply_zero_padded(false, dest1, src1, src2);
            cuda::multiply_zero_padded(false, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);

            cpu::multiply_zero_padded(true, dest1, src1, src2);
            cuda::multiply_zero_padded(true, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
        }

        // Make sure we also test the case where all tensors have the same
        // dimensions.
        resizable_tensor dest1(3,4,5,6);
        resizable_tensor dest2;
        resizable_tensor src1;
        resizable_tensor src2;
        dest2.copy_size(dest1);
        src1.copy_size(dest1);
        src2.copy_size(dest1);

        trnd.fill_uniform(dest1);
        trnd.fill_uniform(dest2);
        trnd.fill_uniform(src1);
        trnd.fill_uniform(src2);
        cpu::multiply_zero_padded(false, dest1, src1, src2);
        cuda::multiply_zero_padded(false, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);

        cpu::multiply_zero_padded(true, dest1, src1, src2);
        cuda::multiply_zero_padded(true, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
    }

    void test_add()
    {
        print_spinner();
@@ -2606,6 +2664,7 @@ namespace
            compare_bn_gpu_and_cpu();
            compare_bn_conv_gpu_and_cpu();
            test_add();
            test_multiply_zero_padded();
            compare_adam();
            test_copy_tensor_gpu();
#endif