From 1d86feaab189d55d78ff8ba30b0e6fb529d547f3 Mon Sep 17 00:00:00 2001 From: Davis King Date: Sat, 15 Jun 2013 15:20:03 -0400 Subject: [PATCH] Made the running covariance objects work with sparse vectors. --- dlib/statistics/statistics.h | 194 +++++++++++++++++++++++++- dlib/statistics/statistics_abstract.h | 88 +++++++++--- dlib/test/statistics.cpp | 75 ++++++++++ 3 files changed, 331 insertions(+), 26 deletions(-) diff --git a/dlib/statistics/statistics.h b/dlib/statistics/statistics.h index 816b8e1ea..674d380a0 100644 --- a/dlib/statistics/statistics.h +++ b/dlib/statistics/statistics.h @@ -8,6 +8,7 @@ #include #include "../algs.h" #include "../matrix.h" +#include "../sparse_vector.h" namespace dlib { @@ -646,9 +647,55 @@ namespace dlib return static_cast(total_count); } - template - void add ( - const matrix_exp& val + void set_dimension ( + long size + ) + { + // make sure requires clause is not broken + DLIB_ASSERT( size > 0, + "\t void running_covariance::set_dimension()" + << "\n\t Invalid inputs were given to this function" + << "\n\t size: " << size + << "\n\t this: " << this + ); + + clear(); + vect_size = size; + total_sum.set_size(size); + total_cov.set_size(size,size); + total_sum = 0; + total_cov = 0; + } + + template + typename disable_if >::type add ( + const T& val + ) + { + // make sure requires clause is not broken + DLIB_ASSERT(((long)max_index_plus_one(val) <= in_vector_size() && in_vector_size() > 0), + "\t void running_covariance::add()" + << "\n\t Invalid inputs were given to this function" + << "\n\t max_index_plus_one(val): " << max_index_plus_one(val) + << "\n\t in_vector_size(): " << in_vector_size() + << "\n\t this: " << this + ); + + for (typename T::const_iterator i = val.begin(); i != val.end(); ++i) + { + total_sum(i->first) += i->second; + for (typename T::const_iterator j = val.begin(); j != val.end(); ++j) + { + total_cov(i->first, j->first) += i->second*j->second; + } + } + + ++total_count; + } + + template + typename enable_if 
>::type add ( + const T& val ) { // make sure requires clause is not broken @@ -810,16 +857,149 @@ namespace dlib return y_vect_size; } + void set_dimensions ( + long x_size, + long y_size + ) + { + // make sure requires clause is not broken + DLIB_ASSERT( x_size > 0 && y_size > 0, + "\t void running_cross_covariance::set_dimensions()" + << "\n\t Invalid inputs were given to this function" + << "\n\t x_size: " << x_size + << "\n\t y_size: " << y_size + << "\n\t this: " << this + ); + + clear(); + x_vect_size = x_size; + y_vect_size = y_size; + sum_x.set_size(x_size); + sum_y.set_size(y_size); + total_cov.set_size(x_size,y_size); + + sum_x = 0; + sum_y = 0; + total_cov = 0; + } + long current_n ( ) const { return static_cast(total_count); } - template - void add ( - const matrix_exp& x, - const matrix_exp& y + template + typename enable_if_c::value && !is_matrix::value>::type add ( + const T& x, + const U& y + ) + { + // make sure requires clause is not broken + DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) && + ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) , + "\t void running_cross_covariance::add()" + << "\n\t Invalid inputs were given to this function" + << "\n\t max_index_plus_one(x): " << max_index_plus_one(x) + << "\n\t max_index_plus_one(y): " << max_index_plus_one(y) + << "\n\t x_vector_size(): " << x_vector_size() + << "\n\t y_vector_size(): " << y_vector_size() + << "\n\t this: " << this + ); + + for (typename T::const_iterator i = x.begin(); i != x.end(); ++i) + { + sum_x(i->first) += i->second; + for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) + { + total_cov(i->first, j->first) += i->second*j->second; + } + } + + // do sum_y += y + for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) + { + sum_y(j->first) += j->second; + } + + ++total_count; + } + + template + typename enable_if_c::value && !is_matrix::value>::type add ( + const T& x, + const U& y + ) + { + 
// make sure requires clause is not broken + DLIB_ASSERT( (is_col_vector(x) && x.size() == x_vector_size() && x_vector_size() > 0) && + ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) , + "\t void running_cross_covariance::add()" + << "\n\t Invalid inputs were given to this function" + << "\n\t is_col_vector(x): " << is_col_vector(x) + << "\n\t x.size(): " << x.size() + << "\n\t max_index_plus_one(y): " << max_index_plus_one(y) + << "\n\t x_vector_size(): " << x_vector_size() + << "\n\t y_vector_size(): " << y_vector_size() + << "\n\t this: " << this + ); + + sum_x += x; + + for (long i = 0; i < x.size(); ++i) + { + for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) + { + total_cov(i, j->first) += x(i)*j->second; + } + } + + // do sum_y += y + for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) + { + sum_y(j->first) += j->second; + } + + ++total_count; + } + + template + typename enable_if_c::value && is_matrix::value>::type add ( + const T& x, + const U& y + ) + { + // make sure requires clause is not broken + DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) && + (is_col_vector(y) && y.size() == (long)y_vector_size() && y_vector_size() > 0) , + "\t void running_cross_covariance::add()" + << "\n\t Invalid inputs were given to this function" + << "\n\t max_index_plus_one(x): " << max_index_plus_one(x) + << "\n\t is_col_vector(y): " << is_col_vector(y) + << "\n\t y.size(): " << y.size() + << "\n\t x_vector_size(): " << x_vector_size() + << "\n\t y_vector_size(): " << y_vector_size() + << "\n\t this: " << this + ); + + for (typename T::const_iterator i = x.begin(); i != x.end(); ++i) + { + sum_x(i->first) += i->second; + for (long j = 0; j < y.size(); ++j) + { + total_cov(i->first, j) += i->second*y(j); + } + } + + sum_y += y; + + ++total_count; + } + + template + typename enable_if_c::value && is_matrix::value>::type add ( + const T& x, + const U& y ) { // make sure requires 
clause is not broken diff --git a/dlib/statistics/statistics_abstract.h b/dlib/statistics/statistics_abstract.h index 730e2b9ae..d6630fcd0 100644 --- a/dlib/statistics/statistics_abstract.h +++ b/dlib/statistics/statistics_abstract.h @@ -6,6 +6,7 @@ #include #include #include "../matrix/matrix_abstract.h" +#include "../svm/sparse_vector_abstract.h" namespace dlib { @@ -472,18 +473,41 @@ namespace dlib - returns 0 !*/ - void add ( - const matrix_exp& val + void set_dimension ( + long size ); /*! requires - - is_col_vector(val) == true - - if (in_vector_size() != 0) then - - val.size() == in_vector_size() + - size > 0 + ensures + - #in_vector_size() == size + - #current_n() == 0 + !*/ + + template + void add ( + const T& val + ); + /*! + requires + - val must represent a column vector. It can either be a dlib::matrix + object or some kind of unsorted sparse vector type. See the top of + dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector. + - val must have a number of dimensions which is compatible with the current + setting of in_vector_size(). In particular, this means that the + following must hold: + - if (val is a dlib::matrix) then + - in_vector_size() == 0 || val.size() == in_vector_size() + - else + - max_index_plus_one(val) <= in_vector_size() + - in_vector_size() > 0 + (i.e. you must call set_dimension() prior to calling add() if + you want to use sparse vectors.) ensures - updates the mean and covariance stored in this object so that the new value is factored into them. - - #in_vector_size() == val.size() + - if (val is a dlib::matrix) then + - #in_vector_size() == val.size() !*/ const column_matrix mean ( @@ -586,6 +610,20 @@ namespace dlib - returns 0 !*/ + void set_dimensions ( + long x_size, + long y_size + ); + /*! + requires + - x_size > 0 + - y_size > 0 + ensures + - #x_vector_size() == x_size + - #y_vector_size() == y_size + - #current_n() == 0 + !*/ + long current_n ( ) const; /*! 
@@ -593,26 +631,38 @@ namespace dlib - returns the number of samples that have been presented to this object. !*/ - template + template void add ( - const matrix_exp& x, - const matrix_exp& y + const T& x, + const U& y ); /*! requires - - is_col_vector(x) == true - - is_col_vector(y) == true - - x.size() != 0 - - y.size() != 0 - - if (x_vector_size() != 0) then - - x.size() == x_vector_size() - - if (y_vector_size() != 0) then - - y.size() == y_vector_size() + - x and y must represent column vectors. They can either be dlib::matrix + objects or some kind of unsorted sparse vector type. See the top of + dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector. + - x and y must have a number of dimensions which is compatible with the + current setting of x_vector_size() and y_vector_size(). In particular, + this means that the following must hold: + - if (x or y is a sparse vector type) then + - x_vector_size() > 0 && y_vector_size() > 0 + (i.e. you must call set_dimensions() prior to calling add() if + you want to use sparse vectors.) + - if (x is a dlib::matrix) then + - x_vector_size() == 0 || x.size() == x_vector_size() + - else + - max_index_plus_one(x) <= x_vector_size() + - if (y is a dlib::matrix) then + - y_vector_size() == 0 || y.size() == y_vector_size() + - else + - max_index_plus_one(y) <= y_vector_size() ensures - updates the mean and cross-covariance matrices stored in this object so that the new (x,y) vector pair is factored into them. 
- - #x_vector_size() == x.size() - - #y_vector_size() == y.size() + - if (x is a dlib::matrix) then + - #x_vector_size() == x.size() + - if (y is a dlib::matrix) then + - #y_vector_size() == y.size() !*/ const column_matrix mean_x ( diff --git a/dlib/test/statistics.cpp b/dlib/test/statistics.cpp index 9841dce54..7dfa61175 100644 --- a/dlib/test/statistics.cpp +++ b/dlib/test/statistics.cpp @@ -171,6 +171,80 @@ namespace DLIB_TEST(max(abs(rcc.mean_y()-ym)) < 1e-14); } + std::map dense_to_sparse ( + const matrix& x + ) + { + std::map temp; + for (long i = 0; i < x.size(); ++i) + temp[i] = x(i); + return temp; + } + + void test_running_cross_covariance_sparse() + { + running_cross_covariance > rcc1, rcc2; + + running_covariance > rc1, rc2; + + matrix xm, ym; + const int num = 40; + + rc1.set_dimension(4); + rc2.set_dimension(4); + + rcc1.set_dimensions(4,5); + rcc2.set_dimensions(4,5); + + dlib::rand rnd; + for (int i = 0; i < num; ++i) + { + matrix x = randm(4,1,rnd); + matrix y = randm(5,1,rnd); + + xm += x/num; + ym += y/num; + + if (i < 15) + { + rcc1.add(x,dense_to_sparse(y)); + rc1.add(x); + } + else if (i < 30) + { + rcc2.add(dense_to_sparse(x),y); + rc2.add(dense_to_sparse(x)); + } + else + { + rcc2.add(dense_to_sparse(x),dense_to_sparse(y)); + rc2.add(x); + } + } + + rnd.clear(); + matrix cov, cov2; + for (int i = 0; i < num; ++i) + { + matrix x = randm(4,1,rnd); + matrix y = randm(5,1,rnd); + cov += (x-xm)*trans(y-ym); + cov2 += (x-xm)*trans(x-xm); + } + cov /= num-1; + cov2 /= num-1; + + running_cross_covariance > rcc = rcc1 + rcc2; + cout << rcc.covariance_xy()-cov << endl; + DLIB_TEST_MSG(max(abs(rcc.covariance_xy()-cov)) < 1e-14, max(abs(rcc.covariance_xy()-cov))); + DLIB_TEST(max(abs(rcc.mean_x()-xm)) < 1e-14); + DLIB_TEST(max(abs(rcc.mean_y()-ym)) < 1e-14); + + running_covariance > rc = rc1 + rc2; + DLIB_TEST(max(abs(rc.covariance()-cov2)) < 1e-14); + DLIB_TEST(max(abs(rc.mean()-xm)) < 1e-14); + } + void test_running_covariance ( ) { @@ -498,6 +572,7 
@@ namespace test_random_subset_selector2(); test_running_covariance(); test_running_cross_covariance(); + test_running_cross_covariance_sparse(); test_running_stats(); test_skewness_and_kurtosis_1(); test_skewness_and_kurtosis_2();