diff --git a/python_examples/svm_struct.py b/python_examples/svm_struct.py
index 4a8fe3b64..77c0054fe 100755
--- a/python_examples/svm_struct.py
+++ b/python_examples/svm_struct.py
@@ -2,9 +2,11 @@
 # The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
 #
 # This is an example illustrating the use of the structural SVM solver from the dlib C++
-# Library. This example will briefly introduce it and then walk through an example showing
-# how to use it to create a simple multi-class classifier.
-#
+# Library. Therefore, this example teaches you the central ideas needed to set up a
+# structural SVM model for your machine learning problems. To illustrate the process, we
+# use dlib's structural SVM solver to learn the parameters of a simple multi-class
+# classifier. We first discuss the multi-class classifier model and then walk through
+# using the structural SVM tools to find the parameters of this classification model.
 #
 # COMPILING THE DLIB PYTHON INTERFACE
 #   Dlib comes with a compiled python interface for python 2.7 on MS Windows. If
@@ -17,46 +19,244 @@
 import dlib
 
+
+def main():
+    # In this example, we have three types of samples: class 0, 1, or 2. That is, each of
+    # our sample vectors falls into one of three classes. To keep this example very
+    # simple, each sample vector is zero everywhere except at one place. The non-zero
+    # dimension of each vector determines the class of the vector. So for example, the
+    # first element of samples has a class of 1 because samples[0][1] is the only non-zero
+    # element of samples[0].
+    samples = [[0,2,0], [1,0,0], [0,4,0], [0,0,3]]
+    # Since we want to use a machine learning method to learn a 3-class classifier we need
+    # to record the labels of our samples. Here samples[i] has a class label of labels[i].
+    labels = [1,0,1,2]
+
+    # Now that we have some training data we can tell the structural SVM to learn the
+    # parameters of our 3-class classifier model. The details of this will be explained
+    # later. For now, just note that it finds the weights (i.e. a vector of real valued
+    # parameters) such that predict_label(weights, sample) always returns the correct label
+    # for a sample vector.
+    problem = three_class_classifier_problem(samples, labels)
+    weights = dlib.solve_structural_svm_problem(problem)
+
+    # Print the weights and then evaluate predict_label() on each of our training samples.
+    # Note that the correct label is predicted for each sample.
+    print weights
+    for i in range(len(samples)):
+        print "predicted label for sample[{}]: {}".format(i, predict_label(weights, samples[i]))
+
+def predict_label(weights, sample):
+    """Given the 9-dimensional weight vector which defines a 3 class classifier, predict the
+    class of the given 3-dimensional sample vector. Therefore, the output of this
+    function is either 0, 1, or 2 (i.e. one of the three possible labels)."""
+
+    # Our 3-class classifier model can be thought of as containing 3 separate linear
+    # classifiers. So to predict the class of a sample vector we evaluate each of these
+    # three classifiers and then whatever classifier has the largest output "wins" and
+    # predicts the label of the sample. This is the popular one-vs-all multi-class
+    # classifier model.
+    #
+    # Keeping this in mind, the code below simply pulls the three separate weight vectors
+    # out of weights and then evaluates each against sample. The individual classifier
+    # scores are stored in scores and the highest scoring index is returned as the label.
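+    #
+    # As a concrete but hypothetical illustration: if weights happened to be
+    # [1,0,0, 0,2,0, 0,0,3] and sample were [0,4,0], the per-class scores computed below
+    # would be [0, 8, 0], so the predicted label would be 1. The numbers are made up and
+    # only meant to show how the slicing and scoring below fit together.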
+    w0 = weights[0:3]
+    w1 = weights[3:6]
+    w2 = weights[6:9]
+    scores = [dot(w0, sample), dot(w1, sample), dot(w2, sample)]
+    max_scoring_label = scores.index(max(scores))
+    return max_scoring_label
+
 def dot(a, b):
     "Compute the dot product between the two vectors a and b."
     return sum(i*j for i,j in zip(a,b))
 
+###########################################################################################
 class three_class_classifier_problem:
-    C = 10
-    be_verbose = True
-    epsilon = 0.0001
+    # Now we arrive at the meat of this example program. To use the
+    # dlib.solve_structural_svm_problem() routine you need to define an object which tells
+    # the structural SVM solver what to do for your problem. In this example, this is done
+    # by defining the three_class_classifier_problem object. Before we get into the
+    # details, we first discuss some background information on structural SVMs.
+    #
+    # A structural SVM is a supervised machine learning method for learning to predict
+    # complex outputs. This is contrasted with a binary classifier which makes only simple
+    # yes/no predictions. A structural SVM, on the other hand, can learn to predict
+    # complex outputs such as entire parse trees or DNA sequence alignments. To do this,
+    # it learns a function F(x,y) which measures how well a particular data sample x
+    # matches a label y, where a label is potentially a complex thing like a parse tree.
+    # However, to keep this example program simple we use only a 3 category label output.
+    #
+    # At test time, the best label for a new x is given by the y which maximizes F(x,y).
+    # To put this into the context of the current example, F(x,y) computes the score for a
+    # given sample and class label. The predicted class label is therefore whatever value
+    # of y makes F(x,y) the biggest. This is exactly what predict_label() does. That is,
+    # it computes F(x,0), F(x,1), and F(x,2) and then reports which label has the biggest
+    # value.
+    #
+    # At a high level, a structural SVM can be thought of as searching the parameter space
+    # of F(x,y) for the set of parameters that make the following inequality true as often
+    # as possible:
+    #     F(x_i,y_i) > max{over all incorrect labels of x_i} F(x_i, y_incorrect)
+    # That is, it seeks to find the parameter vector such that F(x,y) always gives the
+    # highest score to the correct output. To define the structural SVM optimization
+    # problem precisely, we first introduce some notation:
+    #    - let PSI(x,y)    == the joint feature vector for input x and a label y.
+    #    - let F(x,y|w)    == dot(w,PSI(x,y)).
+    #      (we use the | notation to emphasize that F() has the parameter vector of
+    #      weights called w)
+    #    - let LOSS(idx,y) == the loss incurred for predicting that the idx-th training
+    #      sample has a label of y. Note that LOSS() should always be >= 0 and should
+    #      become exactly 0 when y is the correct label for the idx-th sample. Moreover,
+    #      it should notionally indicate how bad it is to predict y for the idx-th sample.
+    #    - let x_i == the i-th training sample.
+    #    - let y_i == the correct label for the i-th training sample.
+    #    - The number of data samples is N.
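+    #
+    # To make PSI concrete for this example program: PSI(x,y) is a 9-dimensional vector
+    # that contains a copy of the 3-dimensional sample x in the block of three positions
+    # selected by the label y, and zeros everywhere else. For instance,
+    # PSI([0,2,0], 1) == [0,0,0, 0,2,0, 0,0,0], so dot(w,PSI(x,1)) equals the score of the
+    # label-1 classifier in predict_label(). (make_psi() below is what actually implements
+    # this for our problem.)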
+    #
+    # Then the optimization problem solved by a structural SVM using
+    # dlib.solve_structural_svm_problem() is the following:
+    #     Minimize: h(w) == 0.5*dot(w,w) + C*R(w)
+    #
+    #     Where R(w) == sum from i=1 to N: 1/N * sample_risk(i,w)
+    #     and sample_risk(i,w) == max over all Y: LOSS(i,Y) + F(x_i,Y|w) - F(x_i,y_i|w)
+    #     and C > 0
+    #
+    # You can think of the sample_risk(i,w) as measuring the degree of error you would make
+    # when predicting the label of the i-th sample using parameters w. That is, it is zero
+    # only when the correct label would be predicted and grows larger the more "wrong" the
+    # predicted output becomes. Therefore, the objective function is minimizing a balance
+    # between making the weights small (typically this reduces overfitting) and fitting the
+    # training data. The degree to which you try to fit the data is controlled by the C
+    # parameter.
+    #
+    # For a more detailed introduction to structured support vector machines you should
+    # consult the following paper:
+    #     Predicting Structured Objects with Support Vector Machines by
+    #     Thorsten Joachims, Thomas Hofmann, Yisong Yue, and Chun-nam Yu
+    #
+
+    # Finally, we come back to the code. To use dlib.solve_structural_svm_problem() you
+    # need to provide the things discussed above. This is the value of C, the number of
+    # training samples, the dimensionality of PSI(), as well as methods for calculating the
+    # loss values and PSI() vectors. You will also need to write code that can compute:
+    # max over all Y: LOSS(i,Y) + F(x_i,Y|w). To summarize, the
+    # three_class_classifier_problem class is required to have the following fields:
+    #    - C
+    #    - num_samples
+    #    - num_dimensions
+    #    - get_truth_joint_feature_vector()
+    #    - separation_oracle()
+
+    C = 1
+
+    # There are also a number of optional arguments:
+    # epsilon is the stopping tolerance. The optimizer will run until R(w) is within
+    # epsilon of its optimal value. If you don't set this then it defaults to 0.001.
+    #epsilon = 1e-13
+
+    # Uncomment this and the optimizer will print its progress to standard out. You will
+    # be able to see things like the current risk gap. The optimizer continues until the
+    # risk gap is below epsilon.
+    #be_verbose = True
+
+    # If you want to require that the learned weights are all non-negative then set this
+    # field to True.
+    #learns_nonnegative_weights = True
+
+    # The optimizer uses an internal cache to avoid unnecessary calls to your
+    # separation_oracle() routine. This parameter controls the size of that cache. Bigger
+    # values use more RAM and might make the optimizer run faster. You can also disable it
+    # by setting it to 0 which is good to do when your separation_oracle is very fast.
+    #max_cache_size = 20
 
     def __init__(self, samples, labels):
+        # dlib.solve_structural_svm_problem() also expects the class to have num_samples
+        # and num_dimensions fields. These fields are expected to contain the number of
+        # training samples and the dimensionality of the psi feature vector respectively.
         self.num_samples = len(samples)
         self.num_dimensions = len(samples[0])*3
+
         self.samples = samples
         self.labels = labels
 
-    def make_psi(self, vector, label):
+    def make_psi(self, x, label):
+        """Compute PSI(x,label)."""
+        # All we are doing here is taking x, which is a 3 dimensional sample vector in this
+        # example program, and putting it into one of 3 places in a 9 dimensional PSI
+        # vector, which we then return. So this function returns PSI(x,label). To see why
+        # we set up PSI like this, recall how predict_label() works. It takes in a 9
+        # dimensional weight vector and breaks the vector into 3 pieces. Each piece then
+        # defines a different classifier and we use them in a one-vs-all manner to predict
+        # the label. So now that we are in the structural SVM code we have to define the
+        # PSI vector to correspond to this usage. That is, we need to set up PSI so that
+        # argmax_y dot(weights,PSI(x,y)) == predict_label(weights,x). This is how we tell
+        # the structural SVM solver what kind of problem we are trying to solve.
+        #
+        # It's worth emphasizing that the single biggest step in using a structural SVM is
+        # deciding how you want to represent PSI(x,label). It is always a vector, but
+        # deciding what to put into it to solve your problem is often not a trivial task.
+        # Part of the difficulty is that you need an efficient method for finding the label
+        # that makes dot(w,PSI(x,label)) the biggest. Sometimes this is easy, but often
+        # finding the max scoring label turns into a difficult combinatorial optimization
+        # problem. So you need to pick a PSI that doesn't make the label maximization step
+        # intractable but also still well models your problem.
+
+        # Create a dense vector object.
         psi = dlib.vector()
+        # Set it to have 9 dimensions. Note that the elements of the vector are 0
+        # initialized.
         psi.resize(self.num_dimensions)
-        dims = len(vector)
+        dims = len(x)
         if (label == 0):
             for i in range(0,dims):
-                psi[i] = vector[i]
+                psi[i] = x[i]
         elif (label == 1):
             for i in range(dims,2*dims):
-                psi[i] = vector[i-dims]
+                psi[i] = x[i-dims]
         else: # the label must be 2
             for i in range(2*dims,3*dims):
-                psi[i] = vector[i-2*dims]
+                psi[i] = x[i-2*dims]
         return psi
 
+    # Now we get to the two member functions that are directly called by
+    # dlib.solve_structural_svm_problem().
+    #
+    # In get_truth_joint_feature_vector(), all you have to do is return the PSI() vector
+    # for the idx-th training sample when it has its true label. So here it returns
+    # PSI(self.samples[idx], self.labels[idx]).
     def get_truth_joint_feature_vector(self, idx):
         return self.make_psi(self.samples[idx], self.labels[idx])
 
+    # separation_oracle() is more interesting. dlib.solve_structural_svm_problem() will
+    # call separation_oracle() many times during the optimization. Each time it will give
+    # it the current value of the parameter weights and the separation_oracle() is supposed
+    # to find the label that most violates the structural SVM objective function for the
+    # idx-th sample. Then the separation oracle reports the corresponding PSI vector and
+    # loss value. To be more precise, separation_oracle() has the following contract:
+    #    requires
+    #        - 0 <= idx < self.num_samples
+    #        - len(current_solution) == self.num_dimensions
+    #    ensures
+    #        - runs the separation oracle on the idx-th sample. We define this as follows:
+    #            - let X == the idx-th training sample.
+    #            - let PSI(X,y) == the joint feature vector for input X and an arbitrary label y.
+    #            - let F(X,y) == dot(current_solution,PSI(X,y)).
+    #            - let LOSS(idx,y) == the loss incurred for predicting that the idx-th sample
+    #              has a label of y. Note that LOSS() should always be >= 0 and should
+    #              become exactly 0 when y is the correct label for the idx-th sample.
+    #
+    #            Then the separation oracle finds a Y such that:
+    #                Y = argmax over all y: LOSS(idx,y) + F(X,y)
+    #                (i.e. it finds the label which maximizes the above expression.)
+    #
+    # Finally, separation_oracle() returns the pair LOSS(idx,Y), PSI(X,Y).
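+    #
+    # As a purely hypothetical illustration of this contract: suppose idx refers to a
+    # sample whose true label is 1 and that, under the current_solution weights, the raw
+    # scores of the three labels are [0.2, 0.5, 0.1]. The loss-augmented scores are then
+    # [1.2, 0.5, 1.1], so the oracle would return Y == 0 with a loss of 1, even though a
+    # plain prediction (without the loss term) would have picked the correct label 1.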
     def separation_oracle(self, idx, current_solution):
-        samp = samples[idx]
+        samp = self.samples[idx]
         dims = len(samp)
         scores = [0,0,0]
         # compute scores for each of the three classifiers
@@ -64,39 +264,30 @@ class three_class_classifier_problem:
         scores[1] = dot(current_solution[dims:2*dims], samp)
         scores[2] = dot(current_solution[2*dims:3*dims], samp)
 
-        # Add in the loss-augmentation
-        if (labels[idx] != 0):
+        # Add in the loss-augmentation. Recall that we maximize LOSS(idx,y) + F(X,y) in
+        # the separation oracle, not just F(X,y) as we normally would in predict_label().
+        if (self.labels[idx] != 0):
             scores[0] += 1
-        if (labels[idx] != 1):
+        if (self.labels[idx] != 1):
             scores[1] += 1
-        if (labels[idx] != 2):
+        if (self.labels[idx] != 2):
             scores[2] += 1
 
         # Now figure out which classifier has the largest loss-augmented score.
         max_scoring_label = scores.index(max(scores))
-        if (max_scoring_label == labels[idx]):
+        # We incur a loss of 1 if we don't predict the correct label and a loss of 0 if we
+        # get the right answer.
+        if (max_scoring_label == self.labels[idx]):
             loss = 0
         else:
             loss = 1
 
+        # Finally, return the loss and PSI vector corresponding to the label we just found.
         psi = self.make_psi(samp, max_scoring_label)
-
         return loss,psi
 
-samples = [[0,0,1], [0,1,0], [1,0,0]];
-labels = [0,1,2]
-
-problem = three_class_classifier_problem(samples, labels)
-weights = dlib.solve_structural_svm_problem(problem)
-print weights
-
-w1 = weights[0:3]
-w2 = weights[3:6]
-w3 = weights[6:9]
-
-print "scores for class 1 sample: ", dot(w1, samples[0]), dot(w2,samples[0]), dot(w3, samples[0])
-print "scores for class 2 sample: ", dot(w1, samples[1]), dot(w2,samples[1]), dot(w3, samples[1])
-print "scores for class 3 sample: ", dot(w1, samples[2]), dot(w2,samples[2]), dot(w3, samples[2])
+if __name__ == "__main__":
+    main()