Added spectral_cluster()

2015-02-11 07:50:27 -05:00 · 2015-02-11 07:50:27 -05:00 · f99e940b28
parent 2e5d2c46c6
commit f99e940b28
3 changed files with 122 additions and 0 deletions
--- a/dlib/clustering.h
+++ b/dlib/clustering.h
@ -5,6 +5,7 @@

 #include "clustering/modularity_clustering.h"
 #include "clustering/chinese_whispers.h"
+#include "clustering/spectral_cluster.h"
 #include "svm/kkmeans.h"

 #endif // DLIB_CLuSTERING_
--- a/dlib/clustering/spectral_cluster.h
+++ b/dlib/clustering/spectral_cluster.h
@ -0,0 +1,78 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_SPECTRAL_CLUSTEr_H_
+#define DLIB_SPECTRAL_CLUSTEr_H_
+
+#include "spectral_cluster_abstract.h"
+#include <vector>
+#include "../matrix.h"
+#include "../svm/kkmeans.h"
+
+namespace dlib
+{
+    // Clusters samples into num_clusters groups using normalized spectral clustering
+    // followed by k-means on the row-normalized spectral vectors.
+    //
+    //   k            - similarity function.  k(samples[i], samples[j]) must be
+    //                  convertible to double.  It is evaluated only for i < j and the
+    //                  value is mirrored, so it is treated as symmetric.
+    //   samples      - indexable container providing size() and operator[].
+    //   num_clusters - number of clusters to produce.  Must be > 0 (enforced with
+    //                  DLIB_CASSERT).
+    //
+    // Returns a vector with one entry per sample.  Entry i is the index of the k-means
+    // center nearest to the spectral vector built for samples[i].
+    template <
+        typename kernel_type,
+        typename vector_type
+        >
+    std::vector<unsigned long> spectral_cluster (
+        const kernel_type& k,
+        const vector_type& samples,
+        const unsigned long num_clusters
+    )
+    {
+        DLIB_CASSERT(num_clusters > 0, 
+            "\t std::vector<unsigned long> spectral_cluster(k,samples,num_clusters)"
+            << "\n\t num_clusters can't be 0."
+            );
+
+        if (num_clusters == 1)
+        {
+            // nothing to do, just assign everything to the 0 cluster.
+            return std::vector<unsigned long>(samples.size(), 0);
+        }
+
+        if (samples.size() == 0)
+        {
+            // With no samples there is nothing to assign.  Return early rather than
+            // running the decomposition and column selection on a 0x0 matrix.
+            return std::vector<unsigned long>();
+        }
+
+        // compute the similarity matrix.  Only the upper triangle is evaluated and then
+        // mirrored into the lower triangle.
+        matrix<double> K(samples.size(), samples.size());
+        for (long r = 0; r < K.nr(); ++r)
+            for (long c = r+1; c < K.nc(); ++c)
+                K(r,c) = K(c,r) = static_cast<double>(k(samples[r], samples[c]));
+        // Self-similarity is ignored: the diagonal is forced to 0 and k(x,x) is never
+        // evaluated.
+        for (long r = 0; r < K.nr(); ++r)
+            K(r,r) = 0;
+
+        // Symmetric normalization: K <- D^(-1/2) * K * D^(-1/2), where D(r) is the sum
+        // of row r of the similarity matrix.
+        matrix<double,0,1> D(K.nr());
+        for (long r = 0; r < K.nr(); ++r)
+            D(r) = sum(rowm(K,r));
+        D = sqrt(reciprocal(D));
+        K = diagm(D)*K*diagm(D);
+        matrix<double> u,w,v;
+        // Use the normal SVD routine unless the matrix is really big, then use the fast
+        // approximate version.
+        if (K.nr() < 1000)
+            svd3(K,u,w,v);
+        else
+            svd_fast(K,u,w,v, num_clusters+100, 5);
+        // Pick out the eigenvectors associated with the largest eigenvalues.
+        rsort_columns(v,w);
+        v = colm(v, range(0,num_clusters-1));
+        // Now build the normalized spectral vectors, one for each input vector.
+        std::vector<matrix<double,0,1> > spec_samps, centers;
+        spec_samps.reserve(v.nr());
+        for (long r = 0; r < v.nr(); ++r)
+        {
+            spec_samps.push_back(trans(rowm(v,r)));
+            // An all-zero row has no direction.  Dividing it by its zero length would
+            // fill it with NaNs and corrupt the k-means step below, so such rows are
+            // left as the zero vector.
+            const double len = length(spec_samps.back());
+            if (len != 0)
+                spec_samps.back() /= len;
+        }
+        // Finally do the K-means clustering
+        pick_initial_centers(num_clusters, centers, spec_samps);
+        find_clusters_using_kmeans(spec_samps, centers);
+        // And then compute the cluster assignments based on the output of K-means.
+        std::vector<unsigned long> assignments;
+        assignments.reserve(spec_samps.size());
+        for (unsigned long i = 0; i < spec_samps.size(); ++i)
+            assignments.push_back(nearest_center(centers, spec_samps[i]));
+
+        return assignments;
+    }
+
+}
+
+#endif // DLIB_SPECTRAL_CLUSTEr_H_
+
--- a/dlib/clustering/spectral_cluster_abstract.h
+++ b/dlib/clustering/spectral_cluster_abstract.h
@ -0,0 +1,43 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_
+#ifdef DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_
+
+#include <vector>
+
+namespace dlib
+{
+    template <
+        typename kernel_type,
+        typename vector_type
+        >
+    std::vector<unsigned long> spectral_cluster (
+        const kernel_type& k,
+        const vector_type& samples,
+        const unsigned long num_clusters
+    );
+    /*!
+        requires
+            - samples must be something with an interface compatible with std::vector
+              (the implementation uses samples.size() and samples[i]).
+            - The following expression must evaluate to a double or float:
+                k(samples[i], samples[j])
+            - num_clusters > 0
+        ensures
+            - Performs the spectral clustering algorithm described in the paper
+              "On spectral clustering: Analysis and an algorithm" by Ng, Jordan, and
+              Weiss, and returns the results.
+            - This function clusters the input data samples into num_clusters clusters and
+              returns a vector that indicates which cluster each sample falls into.  In
+              particular, we return an array A such that:
+                - A.size() == samples.size()
+                - A[i] == the cluster assignment of samples[i].
+                - for all valid i: 0 <= A[i] < num_clusters
+            - If num_clusters == 1 then every sample is assigned to cluster 0 and k() is
+              never evaluated.
+            - The "similarity" of samples[i] with samples[j] is given by
+              k(samples[i],samples[j]).  This means that k() should output a number >= 0
+              and the number should be larger for samples that are more similar.
+            - k() is treated as symmetric: it is only evaluated for pairs with i < j and
+              the result is reused for the pair (j,i).  The similarity of a sample with
+              itself is never evaluated; it is taken to be 0.
+            - NOTE(review): the implementation keeps the first num_clusters columns of
+              the decomposition of a samples.size() x samples.size() matrix, so
+              presumably num_clusters <= samples.size() is also required -- confirm.
+    !*/
+}
+
+#endif // DLIB_SPECTRAL_CLUSTEr_ABSTRACT_H_
+
+