diff --git a/dlib/lsh/create_random_projection_hash.h b/dlib/lsh/create_random_projection_hash.h
index 9e2c05cf1..525147168 100644
--- a/dlib/lsh/create_random_projection_hash.h
+++ b/dlib/lsh/create_random_projection_hash.h
@@ -8,6 +8,7 @@
 #include "../matrix.h"
 #include "../rand.h"
 #include "../statistics.h"
+#include "../svm.h"
 #include <vector>
 
 namespace dlib
@@ -113,6 +114,87 @@ namespace dlib
         return projection_hash(proj, offset);
     }
 
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename vector_type
+        >
+    projection_hash create_max_margin_projection_hash (
+        const vector_type& v,
+        const int bits,
+        const double C = 10
+    )
+    {
+        // make sure requires clause is not broken
+        DLIB_ASSERT(0 < bits && bits <= 32 &&
+                    v.size() > 1,
+            "\t projection_hash create_max_margin_projection_hash()"
+            << "\n\t Invalid arguments were given to this function."
+            << "\n\t bits: " << bits
+            << "\n\t v.size(): " << v.size()
+            );
+
+#ifdef ENABLE_ASSERTS
+        for (unsigned long i = 0; i < v.size(); ++i)
+        {
+            DLIB_ASSERT(v[0].size() == v[i].size() && v[i].size() > 0 && is_col_vector(v[i]),
+                   "\t projection_hash create_max_margin_projection_hash()"
+                   << "\n\t Invalid arguments were given to this function."
+                   << "\n\t m(0).size(): " << v[0].size()
+                   << "\n\t m("<<i<<").size(): " << v[i].size()
+                   << "\n\t is_col_vector(v["<<i<<"]): " << is_col_vector(v[i])
+                   );
+        }
+#endif
+
+        running_covariance<matrix<double> > rc;
+        for (unsigned long i = 0; i < v.size(); ++i)
+            rc.add(matrix_cast<double>(v[i]));
+
+        // compute a whitening matrix
+        matrix<double> whiten = trans(chol(pinv(rc.covariance())));
+        const matrix<double,0,1> meanval = whiten*rc.mean();
+
+        dlib::rand rnd;
+
+
+        typedef matrix<double,0,1> sample_type;
+        random_subset_selector<sample_type> training_samples;
+        random_subset_selector<double> training_labels;
+        // We set this up to use enough samples to cover the vector space used by elements
+        // of v.
+        training_samples.set_max_size(v[0].size()*10);
+        training_labels.set_max_size(v[0].size()*10);
+
+        matrix<double> proj(bits, v[0].size());
+        matrix<double,0,1> offset(bits);
+
+        // learn the random planes and put them into proj and offset.
+        for (int itr = 0; itr < offset.size(); ++itr)
+        {
+            training_samples.make_empty();
+            training_labels.make_empty();
+            // pick random training data and give each sample a random label.
+            for (unsigned long i = 0; i < v.size(); ++i)
+            {
+                training_samples.add(whiten*v[i]-meanval);
+                if (rnd.get_random_double() > 0.5)
+                    training_labels.add(+1);
+                else
+                    training_labels.add(-1);
+            }
+
+            svm_c_linear_dcd_trainer<linear_kernel<sample_type> > trainer;
+            trainer.set_c(C);
+            decision_function<linear_kernel<sample_type> > df = trainer.train(training_samples, training_labels);
+            offset(itr) = -df.b;
+            set_rowm(proj,itr) = trans(df.basis_vectors(0));
+        }
+
+
+        return projection_hash(proj*whiten, offset-proj*meanval);
+    }
+
 // ----------------------------------------------------------------------------------------
 
 }
diff --git a/dlib/lsh/create_random_projection_hash_abstract.h b/dlib/lsh/create_random_projection_hash_abstract.h
index 0cccebdd2..8a1ec6161 100644
--- a/dlib/lsh/create_random_projection_hash_abstract.h
+++ b/dlib/lsh/create_random_projection_hash_abstract.h
@@ -35,7 +35,43 @@ namespace dlib
                 - H.num_hash_bins() == pow(2,bits)
                 - H will be setup so that it hashes the contents of v such that
                   each bin ends up with roughly the same number of elements
-                  in it.
+                  in it.  This is accomplished by picking random hyperplanes
+                  passing through the data.
     !*/
 
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename vector_type
+        >
+    projection_hash create_max_margin_projection_hash (
+        const vector_type& v,
+        const int bits,
+        const double C = 10
+    );
+    /*!
+        requires
+            - 0 < bits <= 32
+            - v.size() > 1
+            - vector_type == a std::vector or compatible type containing dlib::matrix
+              objects, each representing a column vector of the same size.
+            - for all valid i, j:
+                - is_col_vector(v[i]) == true
+                - v[i].size() > 0
+                - v[i].size() == v[j].size()
+                - i.e. v contains only column vectors and all the column vectors
+                  have the same non-zero length
+        ensures
+            - returns a hash function H such that:
+                - H.num_hash_bins() == pow(2,bits)
+                - H will be setup so that it hashes the contents of v such that
+                  each bin ends up with roughly the same number of elements
+                  in it.  This is accomplished using a variation on the random hyperplane
+                  generation technique from the paper:
+                    Random Maximum Margin Hashing by Alexis Joly and Olivier Buisson
+                  In particular, we use the svm_c_linear_dcd_trainer to generate planes.
+                  We train it on randomly selected and randomly labeled points from v.
+                  The C SVM parameter is set to the given C argument.
+    !*/
+
 // ----------------------------------------------------------------------------------------