diff --git a/dlib/svm.h b/dlib/svm.h
index c562b3af9..aa894268b 100644
--- a/dlib/svm.h
+++ b/dlib/svm.h
@@ -19,6 +19,7 @@
 #include "svm/roc_trainer.h"
 #include "svm/kernel_matrix.h"
 #include "svm/empirical_kernel_map.h"
+#include "svm/svm_c_linear_trainer.h"
 
 #endif // DLIB_SVm_HEADER
 
diff --git a/dlib/svm/svm_c_linear_trainer.h b/dlib/svm/svm_c_linear_trainer.h
new file mode 100644
index 000000000..4ec46efb5
--- /dev/null
+++ b/dlib/svm/svm_c_linear_trainer.h
@@ -0,0 +1,596 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_SVM_C_LiNEAR_TRAINER_H__
+#define DLIB_SVM_C_LiNEAR_TRAINER_H__
+
+#include "svm_c_linear_trainer_abstract.h"
+#include "../algs.h"
+#include "../optimization.h"
+#include "../matrix.h"
+#include "function.h"
+#include "kernel.h"
+#include <iostream>
+#include <vector>
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename matrix_type,
+        typename in_sample_vector_type,
+        typename in_scalar_vector_type
+        >
+    class oca_problem_c_svm : public oca_problem<matrix_type>
+    {
+    public:
+        /*
+            This class is used as part of the implementation of the svm_c_linear_trainer
+            defined towards the end of this file.
+
+            The bias parameter is dealt with by imagining that each sample vector has -1
+            as its last element.
+        */
+
+        typedef typename matrix_type::type scalar_type;
+
+        oca_problem_c_svm(
+            const scalar_type C_pos,
+            const scalar_type C_neg,
+            const in_sample_vector_type& samples_,
+            const in_scalar_vector_type& labels_,
+            bool be_verbose_
+        ) :
+            samples(samples_),
+            labels(labels_),
+            Cpos(C_pos),
+            Cneg(C_neg),
+            be_verbose(be_verbose_)
+        {
+            dot_prods.resize(samples.size());
+            is_first_call = true;
+        }
+
+        virtual scalar_type get_c (
+        ) const
+        {
+            return 1;
+        }
+
+        virtual long get_num_dimensions (
+        ) const
+        {
+            // plus 1 for the bias term
+            return num_dimensions_in_samples(samples) + 1;
+        }
+
+        virtual void optimization_status (
+            scalar_type current_objective_value,
+            scalar_type current_error_gap,
+            unsigned long num_cutting_planes
+        ) const
+        {
+            if (be_verbose)
+            {
+                using namespace std;
+                cout << "svm objective: " << current_objective_value << endl;
+                cout << "gap: " << current_error_gap << endl;
+                cout << "num planes: " << num_cutting_planes << endl;
+                cout << endl;
+            }
+        }
+
+        virtual bool r_has_lower_bound (
+            scalar_type& lower_bound
+        ) const
+        {
+            lower_bound = 0;
+            return true;
+        }
+
+        virtual void get_risk (
+            matrix_type& w,
+            scalar_type& risk,
+            matrix_type& subgradient
+        ) const
+        {
+            line_search(w);
+
+            subgradient.set_size(w.size(),1);
+            subgradient = 0;
+            risk = 0;
+
+            // loop over all the samples and compute the risk and its subgradient at the
+            // current solution point w
+            for (long i = 0; i < samples.size(); ++i)
+            {
+                // multiply the current SVM output for the ith sample by its label
+                const scalar_type df_val = labels(i)*dot_prods[i];
+
+                if (labels(i) > 0)
+                    risk += Cpos*std::max<scalar_type>(0, 1 - df_val);
+                else
+                    risk += Cneg*std::max<scalar_type>(0, 1 - df_val);
+
+                if (df_val < 1)
+                {
+                    if (labels(i) > 0)
+                    {
+                        subtract_from(subgradient, samples(i), Cpos);
+
+                        subgradient(subgradient.size()-1) += Cpos;
+                    }
+                    else
+                    {
+                        add_to(subgradient, samples(i), Cneg);
+
+                        subgradient(subgradient.size()-1) -= Cneg;
+                    }
+                }
+            }
+
+            const scalar_type scale = 1.0/samples.size();
+
+            risk *= scale;
+            subgradient = scale*subgradient;
+        }
+
+    private:
+
+    // -----------------------------------------------------
+    // -----------------------------------------------------
+
+        // The next few functions are overloads to handle both dense and sparse vectors
+        template <typename EXP>
+        inline void add_to (
+            matrix_type& subgradient,
+            const matrix_exp<EXP>& sample,
+            const scalar_type& C
+        ) const
+        {
+            for (long r = 0; r < sample.size(); ++r)
+                subgradient(r) += C*sample(r);
+        }
+
+        template <typename T>
+        inline typename disable_if<is_matrix<T> >::type add_to (
+            matrix_type& subgradient,
+            const T& sample,
+            const scalar_type& C
+        ) const
+        {
+            for (unsigned long i = 0; i < sample.size(); ++i)
+                subgradient(sample[i].first) += C*sample[i].second;
+        }
+
+        template <typename EXP>
+        inline void subtract_from (
+            matrix_type& subgradient,
+            const matrix_exp<EXP>& sample,
+            const scalar_type& C
+        ) const
+        {
+            for (long r = 0; r < sample.size(); ++r)
+                subgradient(r) -= C*sample(r);
+        }
+
+        template <typename T>
+        inline typename disable_if<is_matrix<T> >::type subtract_from (
+            matrix_type& subgradient,
+            const T& sample,
+            const scalar_type& C
+        ) const
+        {
+            for (unsigned long i = 0; i < sample.size(); ++i)
+                subgradient(sample[i].first) -= C*sample[i].second;
+        }
+
+        template <typename EXP>
+        scalar_type dot_helper (
+            const matrix_type& w,
+            const matrix_exp<EXP>& sample
+        ) const
+        {
+            return dot(colm(w,0,w.size()-1), sample);
+        }
+
+        template <typename T>
+        typename disable_if<is_matrix<T>,scalar_type>::type dot_helper (
+            const matrix_type& w,
+            const T& sample
+        ) const
+        {
+            // compute a dot product between a dense column vector and a sparse vector
+            scalar_type temp = 0;
+            for (unsigned long i = 0; i < sample.size(); ++i)
+                temp += w(sample[i].first) * sample[i].second;
+            return temp;
+        }
+
+        template <typename T>
+        typename enable_if<is_matrix<T>,unsigned long>::type num_dimensions_in_samples (
+            const T& samples
+        ) const
+        {
+            if (samples.size() > 0)
+                return samples(0).size();
+            else
+                return 0;
+        }
+
+        template <typename T>
+        typename disable_if<is_matrix<T>,unsigned long>::type num_dimensions_in_samples (
+            const T& samples
+        ) const
+        /*!
+            T must contain sparse vectors with an integral key type
+        !*/
+        {
+            // these should be sparse samples so look over all of them to find the max dimension
+            unsigned long max_dim = 0;
+            for (long i = 0; i < samples.size(); ++i)
+            {
+                if (samples(i).size() > 0)
+                    max_dim = std::max<unsigned long>(max_dim, samples(i).back().first + 1);
+            }
+
+            return max_dim;
+        }
+
+    // -----------------------------------------------------
+    // -----------------------------------------------------
+
+        void line_search (
+            matrix_type& w
+        ) const
+        /*!
+            ensures
+                - does a line search to find a better w
+                - for all i: #dot_prods[i] == dot(colm(#w,0,w.size()-1), samples(i)) - #w(w.size()-1)
+        !*/
+        {
+            const scalar_type mu = 0.1;
+
+            for (long i = 0; i < samples.size(); ++i)
+                dot_prods[i] = dot_helper(w,samples(i)) - w(w.size()-1);
+
+            if (is_first_call)
+            {
+                is_first_call = false;
+                best_so_far = w;
+                dot_prods_best = dot_prods;
+            }
+            else
+            {
+                // do a line search going from best_so_far to w and store the results in w.
+                // Here we use the line search algorithm presented in section 3.1.1 of
+                // Franc and Sonnenburg's OCA paper.
+
+                const scalar_type A0 = length_squared(best_so_far - w);
+                const scalar_type B0 = dot(best_so_far, w - best_so_far);
+
+                const scalar_type scale_pos = (get_c()*Cpos)/samples.size();
+                const scalar_type scale_neg = (get_c()*Cneg)/samples.size();
+
+                ks.clear();
+                ks.reserve(samples.size());
+
+                scalar_type f0 = B0;
+                for (long i = 0; i < samples.size(); ++i)
+                {
+                    const scalar_type& scale = (labels(i) > 0) ? scale_pos : scale_neg;
+
+                    const scalar_type B = scale*labels(i) * (dot_prods_best[i] - dot_prods[i]);
+                    const scalar_type C = scale*(1 - labels(i)*dot_prods_best[i]);
+                    // Note that if B is 0 then it doesn't matter what k is set to.  So 0 is fine.
+                    scalar_type k = 0;
+                    if (B != 0)
+                        k = -C/B;
+
+                    if (k > 0)
+                        ks.push_back(helper(k, std::abs(B)));
+
+                    if ( (B < 0 && k > 0) || (B > 0 && k <= 0) )
+                        f0 += B;
+                }
+
+                // ks.size() == 0 shouldn't happen but check anyway
+                if (f0 >= 0 || ks.size() == 0)
+                {
+                    // Getting here means that we aren't searching in a descent direction.
+                    // So don't move the best_so_far position.
+                }
+                else
+                {
+                    std::sort(ks.begin(), ks.end());
+
+                    // figure out where f0 goes positive
+                    scalar_type opt_k = 1;
+                    for (unsigned long i = 0; i < ks.size(); ++i)
+                    {
+                        f0 += ks[i].B;
+                        if (f0 + A0*ks[i].k >= 0)
+                        {
+                            opt_k = ks[i].k;
+                            break;
+                        }
+                    }
+
+                    // take the step suggested by the line search
+                    best_so_far = (1-opt_k)*best_so_far + opt_k*w;
+
+                    // update the best_so_far dot products
+                    for (unsigned long i = 0; i < dot_prods_best.size(); ++i)
+                        dot_prods_best[i] = (1-opt_k)*dot_prods_best[i] + opt_k*dot_prods[i];
+                }
+
+                // Put the best_so_far point into w but also take a little bit of w as well.
+                // We do this since it is possible that some steps won't advance the
+                // best_so_far point.  So this ensures we always make some progress each
+                // iteration.
+                w = (1-mu)*best_so_far + mu*w;
+
+                // update the dot products
+                for (unsigned long i = 0; i < dot_prods.size(); ++i)
+                    dot_prods[i] = (1-mu)*dot_prods_best[i] + mu*dot_prods[i];
+            }
+        }
+
+        struct helper
+        {
+            helper(scalar_type k_, scalar_type B_) : k(k_), B(B_) {}
+            scalar_type k;
+            scalar_type B;
+
+            bool operator< (const helper& item) const { return k < item.k; }
+        };
+
+        mutable std::vector<helper> ks;
+
+        mutable bool is_first_call;
+        mutable std::vector<scalar_type> dot_prods;
+
+        mutable matrix_type best_so_far;  // best w seen so far
+        mutable std::vector<scalar_type> dot_prods_best; // dot products between best_so_far and samples
+
+        const in_sample_vector_type& samples;
+        const in_scalar_vector_type& labels;
+        const scalar_type Cpos;
+        const scalar_type Cneg;
+
+        bool be_verbose;
+    };
+
+    template <
+        typename matrix_type,
+        typename in_sample_vector_type,
+        typename in_scalar_vector_type,
+        typename scalar_type
+        >
+    oca_problem_c_svm<matrix_type, in_sample_vector_type, in_scalar_vector_type> make_oca_problem_c_svm (
+        const scalar_type C_pos,
+        const scalar_type C_neg,
+        const in_sample_vector_type& samples,
+        const in_scalar_vector_type& labels,
+        bool be_verbose
+    )
+    {
+        return oca_problem_c_svm<matrix_type, in_sample_vector_type, in_scalar_vector_type>(
+            C_pos, C_neg, samples, labels, be_verbose);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename K
+        >
+    class svm_c_linear_trainer
+    {
+        /*!
+            REQUIREMENTS ON K
+                Is either linear_kernel or sparse_linear_kernel.
+
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a tool for training the C formulation of a
+                support vector machine for solving binary classification problems.
+                It is optimized for the case where linear kernels are used.
+        !*/
+
+    public:
+        typedef K kernel_type;
+        typedef typename kernel_type::scalar_type scalar_type;
+        typedef typename kernel_type::sample_type sample_type;
+        typedef typename kernel_type::mem_manager_type mem_manager_type;
+        typedef decision_function<kernel_type> trained_function_type;
+
+        svm_c_linear_trainer (
+        )
+        /*!
+            ensures
+                - This object is properly initialized and ready to be used
+                  to train a support vector machine.
+                - #get_oca() == oca() (i.e. an instance of oca with default parameters)
+                - #get_c_class1() == 1
+                - #get_c_class2() == 1
+        !*/
+        {
+            Cpos = 1;
+            Cneg = 1;
+        }
+
+        explicit svm_c_linear_trainer (
+            const scalar_type& C
+        )
+        /*!
+            requires
+                - C > 0
+            ensures
+                - This object is properly initialized and ready to be used
+                  to train a support vector machine.
+                - #get_oca() == oca() (i.e. an instance of oca with default parameters)
+                - #get_c_class1() == C
+                - #get_c_class2() == C
+        !*/
+        {
+            Cpos = C;
+            Cneg = C;
+        }
+
+        void set_oca (
+            const oca& item
+        )
+        /*!
+            ensures
+                - #get_oca() == item
+        !*/
+        {
+            solver = item;
+        }
+
+        const oca get_oca (
+        ) const
+        /*!
+            ensures
+                - returns a copy of the optimizer used to solve the SVM problem.
+        !*/
+        {
+            return solver;
+        }
+
+        const kernel_type get_kernel (
+        ) const
+        /*!
+            ensures
+                - returns a copy of the kernel function in use by this object.
+        !*/
+        {
+            return kernel_type();
+        }
+
+        void set_c (
+            scalar_type C
+        )
+        /*!
+            requires
+                - C > 0
+            ensures
+                - #get_c_class1() == C
+                - #get_c_class2() == C
+        !*/
+        {
+            Cpos = C;
+            Cneg = C;
+        }
+
+        const scalar_type get_c_class1 (
+        ) const
+        /*!
+            ensures
+                - returns the SVM regularization parameter for the +1 class.  It is the
+                  parameter that determines the trade-off between trying to fit the +1
+                  training data exactly or allowing more errors but hopefully improving
+                  the generalization ability of the resulting classifier.  Larger values
+                  encourage exact fitting while smaller values of C may encourage better
+                  generalization.
+        !*/
+        {
+            return Cpos;
+        }
+
+        const scalar_type get_c_class2 (
+        ) const
+        /*!
+            ensures
+                - returns the SVM regularization parameter for the -1 class.  It is the
+                  parameter that determines the trade-off between trying to fit the -1
+                  training data exactly or allowing more errors but hopefully improving
+                  the generalization ability of the resulting classifier.  Larger values
+                  encourage exact fitting while smaller values of C may encourage better
+                  generalization.
+        !*/
+        {
+            return Cneg;
+        }
+
+        void set_c_class1 (
+            scalar_type C
+        )
+        /*!
+            requires
+                - C > 0
+            ensures
+                - #get_c_class1() == C
+        !*/
+        {
+            Cpos = C;
+        }
+
+        void set_c_class2 (
+            scalar_type C
+        )
+        /*!
+            requires
+                - C > 0
+            ensures
+                - #get_c_class2() == C
+        !*/
+        {
+            Cneg = C;
+        }
+
+        template <
+            typename in_sample_vector_type,
+            typename in_scalar_vector_type
+            >
+        const decision_function<kernel_type> train (
+            const in_sample_vector_type& x,
+            const in_scalar_vector_type& y
+        ) const
+        /*!
+            requires
+                - is_binary_classification_problem(x,y) == true
+                - x == a matrix or something convertible to a matrix via vector_to_matrix().
+                  Also, x should contain sample_type objects.
+                - y == a matrix or something convertible to a matrix via vector_to_matrix().
+                  Also, y should contain scalar_type objects.
+            ensures
+                - trains a C support vector classifier given the training samples in x and
+                  labels in y.
+                - returns a decision function F with the following properties:
+                    - if (new_x is a sample predicted to have a +1 label) then
+                        - F(new_x) >= 0
+                    - else
+                        - F(new_x) < 0
+        !*/
+        {
+            typedef matrix<scalar_type,0,1,mem_manager_type> w_type;
+            w_type w;
+
+            scalar_type obj = solver(
+                make_oca_problem_c_svm<w_type>(Cpos, Cneg, vector_to_matrix(x), vector_to_matrix(y), true), w);
+
+            std::cout << "final obj: " << obj << std::endl;
+
+            // put the solution into a decision function and then return it
+            decision_function<kernel_type> df;
+            df.b = static_cast<scalar_type>(w(w.size()-1));
+            df.basis_vectors.set_size(1);
+            df.basis_vectors(0) = matrix_cast<scalar_type>(colm(w, 0, w.size()-1));
+            df.alpha.set_size(1);
+            df.alpha(0) = 1;
+
+            return df;
+        }
+
+    private:
+
+        scalar_type Cpos;
+        scalar_type Cneg;
+        oca solver;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_SVM_C_LiNEAR_TRAINER_H__
+
diff --git a/dlib/svm/svm_c_linear_trainer_abstract.h b/dlib/svm/svm_c_linear_trainer_abstract.h
new file mode 100644
index 000000000..f3c6bf056
--- /dev/null
+++ b/dlib/svm/svm_c_linear_trainer_abstract.h
@@ -0,0 +1,12 @@
+// Copyright (C) 2010  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_SVM_C_LiNEAR_TRAINER_ABSTRACT_H__
+#ifdef DLIB_SVM_C_LiNEAR_TRAINER_ABSTRACT_H__
+
+namespace dlib
+{
+
+}
+
+#endif // DLIB_SVM_C_LiNEAR_TRAINER_ABSTRACT_H__
+
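
For reviewers, a minimal usage sketch of the new trainer, not part of the patch: it follows the usual dlib trainer conventions (column-matrix samples, +1/-1 labels, train() returning a decision_function). The toy data and the C value of 10 are illustrative only.

    #include <dlib/svm.h>
    #include <iostream>
    #include <vector>

    int main()
    {
        using namespace dlib;

        // Dense column-vector samples; linear_kernel selects the dense code
        // path inside svm_c_linear_trainer.
        typedef matrix<double, 2, 1> sample_type;
        typedef linear_kernel<sample_type> kernel_type;

        std::vector<sample_type> samples;
        std::vector<double> labels;

        // A small linearly separable problem: points whose first coordinate
        // is positive get the +1 label.
        for (double x = -5; x <= 5; x += 1)
        {
            for (double y = -5; y <= 5; y += 1)
            {
                sample_type samp;
                samp = x, y;
                samples.push_back(samp);
                labels.push_back(x > 0 ? +1.0 : -1.0);
            }
        }

        svm_c_linear_trainer<kernel_type> trainer;
        trainer.set_c(10);

        // train() accepts std::vectors since they are convertible via
        // vector_to_matrix().
        decision_function<kernel_type> df = trainer.train(samples, labels);

        sample_type test;
        test = 3, 1;
        std::cout << "prediction: " << df(test) << std::endl; // expect >= 0

        return 0;
    }

The sparse path should work the same way: swap in sparse_linear_kernel with a sparse sample type such as std::vector<std::pair<unsigned long,double> >, and the trainer dispatches to the sparse add_to/subtract_from/dot_helper overloads above.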