From 1d86feaab189d55d78ff8ba30b0e6fb529d547f3 Mon Sep 17 00:00:00 2001
From: Davis King <davis@dlib.net>
Date: Sat, 15 Jun 2013 15:20:03 -0400
Subject: [PATCH] Made the running covariance objects work with sparse vectors.

---
 dlib/statistics/statistics.h          | 194 +++++++++++++++++++++++++-
 dlib/statistics/statistics_abstract.h |  88 +++++++++---
 dlib/test/statistics.cpp              |  75 ++++++++++
 3 files changed, 331 insertions(+), 26 deletions(-)
diff --git a/dlib/statistics/statistics.h b/dlib/statistics/statistics.h
index 816b8e1ea..674d380a0 100644
--- a/dlib/statistics/statistics.h
+++ b/dlib/statistics/statistics.h
@@ -8,6 +8,7 @@
 #include <cmath>
 #include "../algs.h"
 #include "../matrix.h"
+#include "../sparse_vector.h"
 
 namespace dlib
 {
@@ -646,9 +647,55 @@ namespace dlib
             return static_cast<long>(total_count);
         }
 
-        template <typename EXP>
-        void add (
-            const matrix_exp<EXP>& val
+        void set_dimension (  // fixes the vector dimensionality and zeroes the accumulators
+            long size  // number of dimensions of the vectors later given to add(); must be > 0
+        )
+        {
+            // Precondition check: size must be strictly positive.
+            DLIB_ASSERT( size > 0,
+                "\t void running_covariance::set_dimension()"
+                << "\n\t Invalid inputs were given to this function"
+                << "\n\t size: " << size 
+                << "\n\t this: " << this
+                );
+
+            clear();  // defined outside this hunk; NOTE(review): presumably resets the sample count -- confirm
+            vect_size = size;
+            total_sum.set_size(size);       // one running-sum entry per dimension
+            total_cov.set_size(size,size);  // size x size accumulator of pairwise products
+            total_sum = 0;  // zero both accumulators: the sparse add() only ever += into them
+            total_cov = 0;
+        }
+
+        template <typename T>  // overload for types where is_matrix<T> is false; iterated as (index,value) pairs
+        typename disable_if<is_matrix<T> >::type add (
+            const T& val
+        )
+        {
+            // Precondition check: every index in val must fit in_vector_size(), which must already be non-zero.
+            DLIB_ASSERT(((long)max_index_plus_one(val) <= in_vector_size() && in_vector_size() > 0),
+                "\t void running_covariance::add()"
+                << "\n\t Invalid inputs were given to this function"
+                << "\n\t max_index_plus_one(val): " << max_index_plus_one(val) 
+                << "\n\t in_vector_size():        " << in_vector_size() 
+                << "\n\t this:                    " << this
+                );
+
+            for (typename T::const_iterator i = val.begin(); i != val.end(); ++i)
+            {
+                total_sum(i->first) += i->second;  // i->first is used as the index, i->second as the value
+                for (typename T::const_iterator j = val.begin(); j != val.end(); ++j)
+                {
+                    total_cov(i->first, j->first) += i->second*j->second;  // product of every pair of stored entries
+                }
+            }
+
+            ++total_count;  // one more sample seen
+        }
+
+        template <typename T>
+        typename enable_if<is_matrix<T> >::type add (
+            const T& val
         )
         {
             // make sure requires clause is not broken
@@ -810,16 +857,149 @@ namespace dlib
             return y_vect_size;
         }
 
+        void set_dimensions (  // fixes both vector dimensionalities and zeroes the accumulators
+            long x_size,  // number of dimensions of the x vectors; must be > 0
+            long y_size   // number of dimensions of the y vectors; must be > 0
+        )
+        {
+            // Precondition check: both sizes must be strictly positive.
+            DLIB_ASSERT( x_size > 0 && y_size > 0,
+                "\t void running_cross_covariance::set_dimensions()"
+                << "\n\t Invalid inputs were given to this function"
+                << "\n\t x_size: " << x_size 
+                << "\n\t y_size: " << y_size 
+                << "\n\t this:   " << this
+                );
+
+            clear();  // defined outside this hunk; NOTE(review): presumably resets the sample count -- confirm
+            x_vect_size = x_size;
+            y_vect_size = y_size;
+            sum_x.set_size(x_size);
+            sum_y.set_size(y_size);
+            total_cov.set_size(x_size,y_size);  // rows are indexed by x dimension, columns by y dimension
+
+            sum_x = 0;  // zero everything: the sparse add() overloads only ever += into these
+            sum_y = 0;
+            total_cov = 0;
+        }
+
         long current_n (
         ) const
         {
             return static_cast<long>(total_count);
         }
 
-        template <typename EXP>
-        void add (
-            const matrix_exp<EXP>& x,
-            const matrix_exp<EXP>& y
+        template <typename T, typename U>  // this overload: neither x nor y is a dlib matrix
+        typename enable_if_c<!is_matrix<T>::value && !is_matrix<U>::value>::type add (
+            const T& x,
+            const U& y
+        )
+        {
+            // Precondition check: the indices in x and y must fit x_vector_size()/y_vector_size(), both non-zero.
+            DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) &&
+                         ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) ,
+                "\t void running_cross_covariance::add()"
+                << "\n\t Invalid inputs were given to this function"
+                << "\n\t max_index_plus_one(x): " << max_index_plus_one(x) 
+                << "\n\t max_index_plus_one(y): " << max_index_plus_one(y) 
+                << "\n\t x_vector_size():       " << x_vector_size() 
+                << "\n\t y_vector_size():       " << y_vector_size() 
+                << "\n\t this:                  " << this
+                );
+
+            for (typename T::const_iterator i = x.begin(); i != x.end(); ++i)
+            {
+                sum_x(i->first) += i->second;  // i->first is used as the index, i->second as the value
+                for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
+                {
+                    total_cov(i->first, j->first) += i->second*j->second;  // product of each stored x entry with each stored y entry
+                }
+            }
+
+            // sum_y += y, done in its own pass so each y entry is added once rather than once per x entry
+            for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
+            {
+                sum_y(j->first) += j->second;
+            }
+
+            ++total_count;  // one more (x,y) pair seen
+        }
+
+        template <typename T, typename U>  // this overload: x is a dlib matrix, y is not
+        typename enable_if_c<is_matrix<T>::value && !is_matrix<U>::value>::type add (
+            const T& x,
+            const U& y
+        )
+        {
+            // Precondition check: x must be a column vector with exactly x_vector_size() (non-zero) elements; y's indices must fit y_vector_size().
+            DLIB_ASSERT( (is_col_vector(x) && x.size() == x_vector_size() && x_vector_size() > 0) &&
+                         ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) ,
+                "\t void running_cross_covariance::add()"
+                << "\n\t Invalid inputs were given to this function"
+                << "\n\t is_col_vector(x):      " << is_col_vector(x) 
+                << "\n\t x.size():              " << x.size() 
+                << "\n\t max_index_plus_one(y): " << max_index_plus_one(y) 
+                << "\n\t x_vector_size():       " << x_vector_size() 
+                << "\n\t y_vector_size():       " << y_vector_size() 
+                << "\n\t this:                  " << this
+                );
+
+            sum_x += x;
+
+            for (long i = 0; i < x.size(); ++i)  // x is dense, so every row index is visited
+            {
+                for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
+                {
+                    total_cov(i, j->first) += x(i)*j->second;  // j->first is used as the column index, j->second as the value
+                }
+            }
+
+            // sum_y += y, written as a loop because y is traversed as (index,value) pairs
+            for (typename U::const_iterator j = y.begin(); j != y.end(); ++j)
+            {
+                sum_y(j->first) += j->second;
+            }
+
+            ++total_count;  // one more (x,y) pair seen
+        }
+
+        template <typename T, typename U>  // this overload: x is not a dlib matrix, y is
+        typename enable_if_c<!is_matrix<T>::value && is_matrix<U>::value>::type add (
+            const T& x,
+            const U& y
+        )
+        {
+            // Precondition check: x's indices must fit x_vector_size(); y must be a column vector with exactly y_vector_size() (non-zero) elements.
+            DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) &&
+                         (is_col_vector(y) && y.size() == (long)y_vector_size() && y_vector_size() > 0) ,
+                "\t void running_cross_covariance::add()"
+                << "\n\t Invalid inputs were given to this function"
+                << "\n\t max_index_plus_one(x): " << max_index_plus_one(x) 
+                << "\n\t is_col_vector(y):      " << is_col_vector(y) 
+                << "\n\t y.size():              " << y.size() 
+                << "\n\t x_vector_size():       " << x_vector_size() 
+                << "\n\t y_vector_size():       " << y_vector_size() 
+                << "\n\t this:                  " << this
+                );
+
+            for (typename T::const_iterator i = x.begin(); i != x.end(); ++i)
+            {
+                sum_x(i->first) += i->second;  // i->first is used as the index, i->second as the value
+                for (long j = 0; j < y.size(); ++j)  // y is dense, so every column index is visited
+                {
+                    total_cov(i->first, j) += i->second*y(j);
+                }
+            }
+
+            sum_y += y;
+
+            ++total_count;  // one more (x,y) pair seen
+        }
+
+        template <typename T, typename U>
+        typename enable_if_c<is_matrix<T>::value && is_matrix<U>::value>::type add (
+            const T& x,
+            const U& y
         )
         {
             // make sure requires clause is not broken
diff --git a/dlib/statistics/statistics_abstract.h b/dlib/statistics/statistics_abstract.h
index 730e2b9ae..d6630fcd0 100644
--- a/dlib/statistics/statistics_abstract.h
+++ b/dlib/statistics/statistics_abstract.h
@@ -6,6 +6,7 @@
 #include <limits>
 #include <cmath>
 #include "../matrix/matrix_abstract.h"
+#include "../svm/sparse_vector_abstract.h"
 
 namespace dlib
 {
@@ -472,18 +473,41 @@ namespace dlib
                     - returns 0
         !*/
 
-        void add (
-            const matrix_exp& val
+        void set_dimension (
+            long size
         );
         /*!
             requires
-                - is_col_vector(val) == true
-                - if (in_vector_size() != 0) then
-                    - val.size() == in_vector_size()
+                - size > 0
+            ensures
+                - #in_vector_size() == size
+                - #current_n() == 0
+        !*/
+
+        template <typename T>
+        void add (
+            const T& val
+        );
+        /*!
+            requires
+                - val must represent a column vector.  It can either be a dlib::matrix
+                  object or some kind of unsorted sparse vector type.  See the top of
+                  dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector.
+                - val must have a number of dimensions which is compatible with the current
+                  setting of in_vector_size().  In particular, this means that the
+                  following must hold:
+                    - if (val is a dlib::matrix) then 
+                        - in_vector_size() == 0 || val.size() == in_vector_size()
+                    - else
+                        - max_index_plus_one(val) <= in_vector_size()
+                        - in_vector_size() > 0 
+                          (i.e. you must call set_dimension() prior to calling add() if
+                          you want to use sparse vectors.)
             ensures
                 - updates the mean and covariance stored in this object so that
                   the new value is factored into them.
-                - #in_vector_size() == val.size()
+                - if (val is a dlib::matrix) then
+                    - #in_vector_size() == val.size()
         !*/
 
         const column_matrix mean (
@@ -586,6 +610,20 @@ namespace dlib
                     - returns 0
         !*/
 
+        void set_dimensions (
+            long x_size,
+            long y_size
+        );
+        /*!
+            requires
+                - x_size > 0
+                - y_size > 0
+            ensures
+                - #x_vector_size() == x_size
+                - #y_vector_size() == y_size
+                - #current_n() == 0
+        !*/
+
         long current_n (
         ) const;
         /*!
@@ -593,26 +631,38 @@ namespace dlib
                 - returns the number of samples that have been presented to this object.
         !*/
 
-        template <typename EXP>
+        template <typename T, typename U>
         void add (
-            const matrix_exp<EXP>& x,
-            const matrix_exp<EXP>& y
+            const T& x,
+            const U& y
         );
         /*!
             requires
-                - is_col_vector(x) == true
-                - is_col_vector(y) == true
-                - x.size() != 0
-                - y.size() != 0
-                - if (x_vector_size() != 0) then
-                    - x.size() == x_vector_size()
-                - if (y_vector_size() != 0) then
-                    - y.size() == y_vector_size()
+                - x and y must represent column vectors.  They can either be dlib::matrix
+                  objects or some kind of unsorted sparse vector type.  See the top of
+                  dlib/svm/sparse_vector_abstract.h for a definition of unsorted sparse vector.
+                - x and y must have a number of dimensions which is compatible with the
+                  current setting of x_vector_size() and y_vector_size().  In particular,
+                  this means that the following must hold:
+                    - if (x or y is a sparse vector type) then
+                        - x_vector_size() > 0 && y_vector_size() > 0
+                          (i.e. you must call set_dimensions() prior to calling add() if
+                          you want to use sparse vectors.)
+                    - if (x is a dlib::matrix) then 
+                        - x_vector_size() == 0 || x.size() == x_vector_size()
+                    - else
+                        - max_index_plus_one(x) <= x_vector_size()
+                    - if (y is a dlib::matrix) then 
+                        - y_vector_size() == 0 || y.size() == y_vector_size()
+                    - else
+                        - max_index_plus_one(y) <= y_vector_size()
             ensures
                 - updates the mean and cross-covariance matrices stored in this object so
                   that the new (x,y) vector pair is factored into them.
-                - #x_vector_size() == x.size()
-                - #y_vector_size() == y.size()
+                - if (x is a dlib::matrix) then
+                    - #x_vector_size() == x.size()
+                - if (y is a dlib::matrix) then
+                    - #y_vector_size() == y.size()
         !*/
 
         const column_matrix mean_x (
diff --git a/dlib/test/statistics.cpp b/dlib/test/statistics.cpp
index 9841dce54..7dfa61175 100644
--- a/dlib/test/statistics.cpp
+++ b/dlib/test/statistics.cpp
@@ -171,6 +171,80 @@ namespace
             DLIB_TEST(max(abs(rcc.mean_y()-ym)) < 1e-14);
         }
 
+        std::map<unsigned long,double> dense_to_sparse ( 
+            const matrix<double,0,1>& x
+        )
+        {
+            std::map<unsigned long,double> sparse;  // index -> value, one entry per element of x
+            for (long idx = 0; idx != x.size(); ++idx)
+                sparse[idx] = x(idx);
+            return sparse;
+        }
+
+        void test_running_cross_covariance_sparse()  // checks the sparse add() overloads against batch statistics
+        {
+            running_cross_covariance<matrix<double> > rcc1, rcc2;
+
+            running_covariance<matrix<double> > rc1, rc2;
+
+            matrix<double,0,1> xm, ym;
+            const int num = 40;
+
+            rc1.set_dimension(4);  // the sparse add() overloads require the dimensions to be set first
+            rc2.set_dimension(4);
+
+            rcc1.set_dimensions(4,5);
+            rcc2.set_dimensions(4,5);
+
+            dlib::rand rnd;
+            for (int i = 0; i < num; ++i)
+            {
+                matrix<double,0,1> x = randm(4,1,rnd);
+                matrix<double,0,1> y = randm(5,1,rnd);
+
+                xm += x/num;  // accumulate the sample means of x and y
+                ym += y/num;
+
+                if (i < 15)  // dense x, sparse y
+                {
+                    rcc1.add(x,dense_to_sparse(y));
+                    rc1.add(x);
+                }
+                else if (i < 30)  // sparse x, dense y
+                {
+                    rcc2.add(dense_to_sparse(x),y);
+                    rc2.add(dense_to_sparse(x));
+                }
+                else  // sparse x, sparse y
+                {
+                    rcc2.add(dense_to_sparse(x),dense_to_sparse(y));
+                    rc2.add(x);
+                }
+            }
+
+            rnd.clear();  // NOTE(review): presumably rewinds the generator so the loop below replays the same samples -- confirm
+            matrix<double> cov, cov2;
+            for (int i = 0; i < num; ++i)
+            {
+                matrix<double,0,1> x = randm(4,1,rnd);
+                matrix<double,0,1> y = randm(5,1,rnd);
+                cov += (x-xm)*trans(y-ym);
+                cov2 += (x-xm)*trans(x-xm);
+            }
+            cov /= num-1;
+            cov2 /= num-1;
+
+            running_cross_covariance<matrix<double> > rcc = rcc1 + rcc2;
+            // No unconditional printing here: the error magnitude is already passed to DLIB_TEST_MSG as its message.
+            DLIB_TEST_MSG(max(abs(rcc.covariance_xy()-cov)) < 1e-14, max(abs(rcc.covariance_xy()-cov)));
+            DLIB_TEST(max(abs(rcc.mean_x()-xm)) < 1e-14);
+            DLIB_TEST(max(abs(rcc.mean_y()-ym)) < 1e-14);
+
+            running_covariance<matrix<double> > rc = rc1 + rc2;
+            DLIB_TEST(max(abs(rc.covariance()-cov2)) < 1e-14);
+            DLIB_TEST(max(abs(rc.mean()-xm)) < 1e-14);
+        }
+
         void test_running_covariance (
         )
         {
@@ -498,6 +572,7 @@ namespace
             test_random_subset_selector2();
             test_running_covariance();
             test_running_cross_covariance();
+            test_running_cross_covariance_sparse();
             test_running_stats();
             test_skewness_and_kurtosis_1();
             test_skewness_and_kurtosis_2();