mirror of https://github.com/davisking/dlib.git
Added bottom_up_cluster()
This commit is contained in:
parent
c9c3fa17b5
commit
a2c8aab219
|
@ -6,6 +6,7 @@
|
|||
#include "clustering/modularity_clustering.h"
|
||||
#include "clustering/chinese_whispers.h"
|
||||
#include "clustering/spectral_cluster.h"
|
||||
#include "clustering/bottom_up_cluster.h"
|
||||
#include "svm/kkmeans.h"
|
||||
|
||||
#endif // DLIB_CLuSTERING_
|
||||
|
|
|
@ -0,0 +1,138 @@
|
|||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_
|
||||
#define DLIB_BOTTOM_uP_CLUSTER_Hh_
|
||||
|
||||
#include <queue>
|
||||
#include <map>
|
||||
|
||||
#include "bottom_up_cluster_abstract.h"
|
||||
#include "../algs.h"
|
||||
#include "../matrix.h"
|
||||
#include "../disjoint_subsets.h"
|
||||
#include "../graph_utils.h"
|
||||
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
namespace buc_impl
|
||||
{
|
||||
inline void merge_sets (
|
||||
matrix<double>& dists,
|
||||
unsigned long dest,
|
||||
unsigned long src
|
||||
)
|
||||
{
|
||||
for (long r = 0; r < dists.nr(); ++r)
|
||||
dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src));
|
||||
}
|
||||
|
||||
struct compare_dist
|
||||
{
|
||||
bool operator() (
|
||||
const sample_pair& a,
|
||||
const sample_pair& b
|
||||
) const
|
||||
{
|
||||
return a.distance() > b.distance();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
template <
|
||||
typename EXP
|
||||
>
|
||||
unsigned long bottom_up_cluster (
|
||||
const matrix_exp<EXP>& dists_,
|
||||
std::vector<unsigned long>& labels,
|
||||
unsigned long min_num_clusters,
|
||||
double max_dist = std::numeric_limits<double>::infinity()
|
||||
)
|
||||
{
|
||||
matrix<double> dists = matrix_cast<double>(dists_);
|
||||
// make sure requires clause is not broken
|
||||
DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0,
|
||||
"\t unsigned long bottom_up_cluster()"
|
||||
<< "\n\t Invalid inputs were given to this function."
|
||||
<< "\n\t dists.nr(): " << dists.nr()
|
||||
<< "\n\t dists.nc(): " << dists.nc()
|
||||
<< "\n\t min_num_clusters: " << min_num_clusters
|
||||
);
|
||||
|
||||
using namespace buc_impl;
|
||||
|
||||
labels.resize(dists.nr());
|
||||
disjoint_subsets sets;
|
||||
sets.set_size(dists.nr());
|
||||
if (labels.size() == 0)
|
||||
return 0;
|
||||
|
||||
// push all the edges in the graph into a priority queue so the best edges to merge
|
||||
// come first.
|
||||
std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que;
|
||||
for (long r = 0; r < dists.nr(); ++r)
|
||||
for (long c = r+1; c < dists.nc(); ++c)
|
||||
que.push(sample_pair(r,c,dists(r,c)));
|
||||
|
||||
// Now start merging nodes.
|
||||
for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter)
|
||||
{
|
||||
// find the next best thing to merge.
|
||||
double best_dist = que.top().distance();
|
||||
unsigned long a = sets.find_set(que.top().index1());
|
||||
unsigned long b = sets.find_set(que.top().index2());
|
||||
que.pop();
|
||||
// we have been merging and modifying the distances, so make sure this distance
|
||||
// is still valid and these guys haven't been merged already.
|
||||
while(a == b || best_dist < dists(a,b))
|
||||
{
|
||||
// Haven't merged it yet, so put it back in with updated distance for
|
||||
// reconsideration later.
|
||||
if (a != b)
|
||||
que.push(sample_pair(a, b, dists(a, b)));
|
||||
|
||||
best_dist = que.top().distance();
|
||||
a = sets.find_set(que.top().index1());
|
||||
b = sets.find_set(que.top().index2());
|
||||
que.pop();
|
||||
}
|
||||
|
||||
|
||||
// now merge these sets if the best distance is small enough
|
||||
if (best_dist > max_dist)
|
||||
break;
|
||||
unsigned long news = sets.merge_sets(a,b);
|
||||
unsigned long olds = (news==a)?b:a;
|
||||
merge_sets(dists, news, olds);
|
||||
}
|
||||
|
||||
// figure out which cluster each element is in. Also make sure the labels are
|
||||
// contiguous.
|
||||
std::map<unsigned long, unsigned long> relabel;
|
||||
for (unsigned long r = 0; r < labels.size(); ++r)
|
||||
{
|
||||
unsigned long l = sets.find_set(r);
|
||||
// relabel to make contiguous
|
||||
if (relabel.count(l) == 0)
|
||||
{
|
||||
unsigned long next = relabel.size();
|
||||
relabel[l] = next;
|
||||
}
|
||||
labels[r] = relabel[l];
|
||||
}
|
||||
|
||||
|
||||
return relabel.size();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#undef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
|
||||
#ifdef DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
|
||||
|
||||
#include "../matrix.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
template <
|
||||
typename EXP
|
||||
>
|
||||
unsigned long bottom_up_cluster (
|
||||
const matrix_exp<EXP>& dists,
|
||||
std::vector<unsigned long>& labels,
|
||||
unsigned long min_num_clusters,
|
||||
double max_dist = std::numeric_limits<double>::infinity()
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- dists.nr() == dists.nc()
|
||||
- min_num_clusters > 0
|
||||
- dists == trans(dists)
|
||||
(l.e. dists should be symmetric)
|
||||
ensures
|
||||
- Runs a bottom up agglomerative clustering algorithm.
|
||||
- Interprets dists as a matrix that gives the distances between dists.nr()
|
||||
items. In particular, we take dists(i,j) to be the distance between the ith
|
||||
and jth element of some set. This function clusters the elements of this set
|
||||
into at least min_num_clusters (or dists.nr() if there aren't enough
|
||||
elements). Additionally, within each cluster, the maximum pairwise distance
|
||||
between any two cluster elements is <= max_dist.
|
||||
- returns the number of clusters found.
|
||||
- #labels.size() == dists.nr()
|
||||
- for all valid i:
|
||||
- #labels[i] == the cluster ID of the node with index i (i.e. the node
|
||||
corresponding to the distances dists(i,*)).
|
||||
- 0 <= #labels[i] < the number of clusters found
|
||||
(i.e. cluster IDs are assigned contiguously and start at 0)
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_BOTTOM_uP_CLUSTER_ABSTRACT_Hh_
|
||||
|
||||
|
|
@ -220,6 +220,89 @@ namespace
|
|||
}
|
||||
}
|
||||
|
||||
void test_bottom_up_clustering()
|
||||
{
|
||||
std::vector<dpoint> pts;
|
||||
pts.push_back(dpoint(0.0,0.0));
|
||||
pts.push_back(dpoint(0.5,0.0));
|
||||
pts.push_back(dpoint(0.5,0.5));
|
||||
pts.push_back(dpoint(0.0,0.5));
|
||||
|
||||
pts.push_back(dpoint(3.0,3.0));
|
||||
pts.push_back(dpoint(3.5,3.0));
|
||||
pts.push_back(dpoint(3.5,3.5));
|
||||
pts.push_back(dpoint(3.0,3.5));
|
||||
|
||||
pts.push_back(dpoint(7.0,7.0));
|
||||
pts.push_back(dpoint(7.5,7.0));
|
||||
pts.push_back(dpoint(7.5,7.5));
|
||||
pts.push_back(dpoint(7.0,7.5));
|
||||
|
||||
matrix<double> dists(pts.size(), pts.size());
|
||||
for (long r = 0; r < dists.nr(); ++r)
|
||||
for (long c = 0; c < dists.nc(); ++c)
|
||||
dists(r,c) = length(pts[r]-pts[c]);
|
||||
|
||||
|
||||
matrix<unsigned long,0,1> truth(12);
|
||||
truth = 0, 0, 0, 0,
|
||||
1, 1, 1, 1,
|
||||
2, 2, 2, 2;
|
||||
|
||||
std::vector<unsigned long> labels;
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 3);
|
||||
DLIB_TEST(mat(labels) == truth);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 4.0) == 3);
|
||||
DLIB_TEST(mat(labels) == truth);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 4.95) == 2);
|
||||
truth = 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
1, 1, 1, 1;
|
||||
DLIB_TEST(mat(labels) == truth);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 1);
|
||||
truth = 0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0;
|
||||
DLIB_TEST(mat(labels) == truth);
|
||||
|
||||
dists.set_size(0,0);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 0);
|
||||
DLIB_TEST(labels.size() == 0);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 0);
|
||||
DLIB_TEST(labels.size() == 0);
|
||||
|
||||
dists.set_size(1,1);
|
||||
dists = 1;
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 1);
|
||||
DLIB_TEST(labels.size() == 1);
|
||||
DLIB_TEST(labels[0] == 0);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 1);
|
||||
DLIB_TEST(labels.size() == 1);
|
||||
DLIB_TEST(labels[0] == 0);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 0) == 1);
|
||||
DLIB_TEST(labels.size() == 1);
|
||||
DLIB_TEST(labels[0] == 0);
|
||||
|
||||
dists.set_size(2,2);
|
||||
dists = 1;
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 3) == 2);
|
||||
DLIB_TEST(labels.size() == 2);
|
||||
DLIB_TEST(labels[0] == 0);
|
||||
DLIB_TEST(labels[1] == 1);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1) == 1);
|
||||
DLIB_TEST(labels.size() == 2);
|
||||
DLIB_TEST(labels[0] == 0);
|
||||
DLIB_TEST(labels[1] == 0);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 1) == 1);
|
||||
DLIB_TEST(labels.size() == 2);
|
||||
DLIB_TEST(labels[0] == 0);
|
||||
DLIB_TEST(labels[1] == 0);
|
||||
DLIB_TEST(bottom_up_cluster(dists, labels, 1, 0.999) == 2);
|
||||
DLIB_TEST(labels.size() == 2);
|
||||
DLIB_TEST(labels[0] == 0);
|
||||
DLIB_TEST(labels[1] == 1);
|
||||
}
|
||||
|
||||
class test_clustering : public tester
|
||||
{
|
||||
public:
|
||||
|
@ -232,6 +315,8 @@ namespace
|
|||
void perform_test (
|
||||
)
|
||||
{
|
||||
test_bottom_up_clustering();
|
||||
|
||||
dlib::rand rnd;
|
||||
|
||||
std::vector<sample_pair> edges;
|
||||
|
|
Loading…
Reference in New Issue