Added bottom_up_cluster()

This commit is contained in:
Davis King 2015-06-23 08:24:48 -04:00
parent c9c3fa17b5
commit a2c8aab219
4 changed files with 275 additions and 0 deletions

View File

@ -6,6 +6,7 @@
#include "clustering/modularity_clustering.h"
#include "clustering/chinese_whispers.h"
#include "clustering/spectral_cluster.h"
#include "clustering/bottom_up_cluster.h"
#include "svm/kkmeans.h"
#endif // DLIB_CLuSTERING_

View File

@ -0,0 +1,138 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_
#define DLIB_BOTTOM_uP_CLUSTER_Hh_
#include <queue>
#include <map>
#include "bottom_up_cluster_abstract.h"
#include "../algs.h"
#include "../matrix.h"
#include "../disjoint_subsets.h"
#include "../graph_utils.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
namespace buc_impl
{
inline void merge_sets (
matrix<double>& dists,
unsigned long dest,
unsigned long src
)
{
for (long r = 0; r < dists.nr(); ++r)
dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src));
}
struct compare_dist
{
bool operator() (
const sample_pair& a,
const sample_pair& b
) const
{
return a.distance() > b.distance();
}
};
}
// ----------------------------------------------------------------------------------------
template <
typename EXP
>
unsigned long bottom_up_cluster (
const matrix_exp<EXP>& dists_,
std::vector<unsigned long>& labels,
unsigned long min_num_clusters,
double max_dist = std::numeric_limits<double>::infinity()
)
{
matrix<double> dists = matrix_cast<double>(dists_);
// make sure requires clause is not broken
DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0,
"\t unsigned long bottom_up_cluster()"
<< "\n\t Invalid inputs were given to this function."
<< "\n\t dists.nr(): " << dists.nr()
<< "\n\t dists.nc(): " << dists.nc()
<< "\n\t min_num_clusters: " << min_num_clusters
);
using namespace buc_impl;
labels.resize(dists.nr());
disjoint_subsets sets;
sets.set_size(dists.nr());
if (labels.size() == 0)
return 0;
// push all the edges in the graph into a priority queue so the best edges to merge
// come first.
std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que;
for (long r = 0; r < dists.nr(); ++r)
for (long c = r+1; c < dists.nc(); ++c)
que.push(sample_pair(r,c,dists(r,c)));
// Now start merging nodes.
for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter)
{
// find the next best thing to merge.
double best_dist = que.top().distance();
unsigned long a = sets.find_set(que.top().index1());
unsigned long b = sets.find_set(que.top().index2());
que.pop();
// we have been merging and modifying the distances, so make sure this distance
// is still valid and these guys haven't been merged already.
while(a == b || best_dist < dists(a,b))
{
// Haven't merged it yet, so put it back in with updated distance for
// reconsideration later.
if (a != b)
que.push(sample_pair(a, b, dists(a, b)));
best_dist = que.top().distance();
a = sets.find_set(que.top().index1());
b = sets.find_set(que.top().index2());
que.pop();
}
// now merge these sets if the best distance is small enough
if (best_dist > max_dist)
break;
unsigned long news = sets.merge_sets(a,b);
unsigned long olds = (news==a)?b:a;
merge_sets(dists, news, olds);
}
// figure out which cluster each element is in. Also make sure the labels are
// contiguous.
std::map<unsigned long, unsigned long> relabel;
for (unsigned long r = 0; r < labels.size(); ++r)
{
unsigned long l = sets.find_set(r);
// relabel to make contiguous
if (relabel.count(l) == 0)
{
unsigned long next = relabel.size();
relabel[l] = next;
}
labels[r] = relabel[l];
}
return relabel.size();
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_

View File

@ -0,0 +1,51 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
#ifdef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
template <
typename EXP
>
unsigned long bottom_up_cluster (
const matrix_exp<EXP>& dists,
std::vector<unsigned long>& labels,
unsigned long min_num_clusters,
double max_dist = std::numeric_limits<double>::infinity()
);
/*!
requires
- dists.nr() == dists.nc()
- min_num_clusters > 0
- dists == trans(dists)
(l.e. dists should be symmetric)
ensures
- Runs a bottom up agglomerative clustering algorithm.
- Interprets dists as a matrix that gives the distances between dists.nr()
items. In particular, we take dists(i,j) to be the distance between the ith
and jth element of some set. This function clusters the elements of this set
into at least min_num_clusters (or dists.nr() if there aren't enough
elements). Additionally, within each cluster, the maximum pairwise distance
between any two cluster elements is <= max_dist.
- returns the number of clusters found.
- #labels.size() == dists.nr()
- for all valid i:
- #labels[i] == the cluster ID of the node with index i (i.e. the node
corresponding to the distances dists(i,*)).
- 0 <= #labels[i] < the number of clusters found
(i.e. cluster IDs are assigned contiguously and start at 0)
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_

View File

@ -220,6 +220,89 @@ namespace
}
}
void test_bottom_up_clustering()
{
std::vector<dpoint> pts;
pts.push_back(dpoint(0.0,0.0));
pts.push_back(dpoint(0.5,0.0));
pts.push_back(dpoint(0.5,0.5));
pts.push_back(dpoint(0.0,0.5));
pts.push_back(dpoint(3.0,3.0));
pts.push_back(dpoint(3.5,3.0));
pts.push_back(dpoint(3.5,3.5));
pts.push_back(dpoint(3.0,3.5));
pts.push_back(dpoint(7.0,7.0));
pts.push_back(dpoint(7.5,7.0));
pts.push_back(dpoint(7.5,7.5));
pts.push_back(dpoint(7.0,7.5));
matrix<double> dists(pts.size(), pts.size());
for (long r = 0; r < dists.nr(); ++r)
for (long c = 0; c < dists.nc(); ++c)
dists(r,c) = length(pts[r]-pts[c]);
matrix<unsigned long,0,1> truth(12);
truth = 0, 0, 0, 0,
1, 1, 1, 1,
2, 2, 2, 2;
std::vector<unsigned long> labels;
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 3);
DLIB_TEST(mat(labels) == truth);
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 4.0) == 3);
DLIB_TEST(mat(labels) == truth);
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 4.95) == 2);
truth = 0, 0, 0, 0,
0, 0, 0, 0,
1, 1, 1, 1;
DLIB_TEST(mat(labels) == truth);
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 1);
truth = 0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0;
DLIB_TEST(mat(labels) == truth);
dists.set_size(0,0);
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 0);
DLIB_TEST(labels.size() == 0);
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 0);
DLIB_TEST(labels.size() == 0);
dists.set_size(1,1);
dists = 1;
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 1);
DLIB_TEST(labels.size() == 1);
DLIB_TEST(labels[0] == 0);
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 1);
DLIB_TEST(labels.size() == 1);
DLIB_TEST(labels[0] == 0);
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 0) == 1);
DLIB_TEST(labels.size() == 1);
DLIB_TEST(labels[0] == 0);
dists.set_size(2,2);
dists = 1;
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 2);
DLIB_TEST(labels.size() == 2);
DLIB_TEST(labels[0] == 0);
DLIB_TEST(labels[1] == 1);
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 1);
DLIB_TEST(labels.size() == 2);
DLIB_TEST(labels[0] == 0);
DLIB_TEST(labels[1] == 0);
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 1) == 1);
DLIB_TEST(labels.size() == 2);
DLIB_TEST(labels[0] == 0);
DLIB_TEST(labels[1] == 0);
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 0.999) == 2);
DLIB_TEST(labels.size() == 2);
DLIB_TEST(labels[0] == 0);
DLIB_TEST(labels[1] == 1);
}
class test_clustering : public tester
{
public:
@ -232,6 +315,8 @@ namespace
void perform_test (
)
{
test_bottom_up_clustering();
dlib::rand rnd;
std::vector<sample_pair> edges;