Optimized the matrix multiply a little more.

--HG--
extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%402742
This commit is contained in:
Davis King 2008-12-19 21:54:52 +00:00
parent 7b53a1cb3f
commit ded490dc9d
2 changed files with 30 additions and 10 deletions

View File

@@ -82,18 +82,25 @@ namespace dlib
!*/
{
using namespace ma;
const matrix_exp<EXP1>& lhs = src.lhs;
const matrix_exp<EXP2>& rhs = src.rhs;
const long bs = 100;
const EXP1& lhs = src.lhs;
const EXP2& rhs = src.rhs;
const long bs = 90;
set_all_elements(dest,0);
// if the matrices are small enough then just use the simple multiply algorithm
if (lhs.nc() <= 2 || rhs.nc() <= 2 || lhs.nr() <= 2 || rhs.nr() <= 2 || (lhs.size() <= bs*10 && rhs.size() <= bs*10) )
{
for (long r = 0; r < src.nr(); ++r)
// This loop is optimized assuming that the data is laid out in
// row major order in memory.
for (long r = 0; r< lhs.nr(); ++r)
{
for (long c = 0; c < src.nc(); ++c)
for (long c = 0; c< lhs.nc(); ++c)
{
dest(r,c) = src(r,c);
const typename EXP2::type temp = lhs(r,c);
for (long i = 0; i < rhs.nc(); ++i)
{
dest(r,i) += rhs(c,i)*temp;
}
}
}
}
@@ -102,6 +109,7 @@ namespace dlib
// if the lhs and rhs matrices are big enough we should use a cache friendly
// algorithm that computes the matrix multiply in blocks.
// Loop over all the blocks in the lhs matrix
for (long r = 0; r < lhs.nr(); r+=bs)
{
@@ -118,10 +126,20 @@ namespace dlib
// make a target rect in res
rectangle res_block(rhs_block.left(),lhs_block.top(), rhs_block.right(), lhs_block.bottom());
if (c != 0)
set_subm(dest, res_block) = subm(dest,res_block) + subm(lhs,lhs_block)*subm(rhs, rhs_block);
else
set_subm(dest, res_block) = null_exp(subm(lhs,lhs_block)*subm(rhs, rhs_block));
// This loop is optimized assuming that the data is laid out in
// row major order in memory.
for (long r = lhs_block.top(); r <= lhs_block.bottom(); ++r)
{
for (long c = lhs_block.left(); c<= lhs_block.right(); ++c)
{
const typename EXP2::type temp = lhs(r,c);
for (long i = rhs_block.left(); i <= rhs_block.right(); ++i)
{
dest(r,i) += rhs(c,i)*temp;
}
}
}
}
}
}

View File

@@ -36,8 +36,10 @@ namespace dlib
// inline behavior out of GCC.
#ifdef __GNUC__
// When __GNUC__ is defined, the macros expand to the matching
// __attribute__ specifiers: noinline and always_inline respectively.
#define DLIB_DONT_INLINE __attribute__((noinline))
#define DLIB_ALWAYS_INLINE __attribute__((always_inline))
#else
// Otherwise both macros expand to nothing, so annotating a declaration
// with them has no effect.
#define DLIB_DONT_INLINE
#define DLIB_ALWAYS_INLINE
#endif
template <