mirror of https://github.com/davisking/dlib.git
Added an example program for the empirical_kernel_map.
--HG-- extra : convert_revision : svn%3Afdd8eb12-d10e-0410-9acb-85c331704f74/trunk%403352
This commit is contained in:
parent
b6f926e030
commit
6b400564d0
|
@ -35,6 +35,7 @@ add_example(bayes_net_gui_ex)
|
|||
add_example(compress_stream_ex)
|
||||
add_example(config_reader_ex)
|
||||
add_example(dir_nav_ex)
|
||||
add_example(empirical_kernel_map_ex)
|
||||
add_example(file_to_code_ex)
|
||||
add_example(gui_api_ex)
|
||||
add_example(image_ex)
|
||||
|
|
|
@ -0,0 +1,355 @@
|
|||
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
|
||||
/*
|
||||
|
||||
This is an example illustrating the use of the empirical_kernel_map
|
||||
from the dlib C++ Library.
|
||||
|
||||
This example program assumes you are familiar with some general elements of
|
||||
the library. In particular, you should have at least read the svm_ex.cpp
|
||||
and matrix_ex.cpp examples.
|
||||
|
||||
|
||||
Most of the machine learning algorithms in dlib are some flavor of "kernel machine".
|
||||
This means they are all simple linear algorithms that have been formulated such
|
||||
that the only way they look at the data given by a user is via dot products between
|
||||
the data samples. These algorithms are made more useful via the application of the
|
||||
so called kernel trick. This trick is to replace the dot product with a user
|
||||
supplied function which takes two samples and returns a real number. This function
|
||||
is the kernel that is required by so many algorithms. The most basic kernel is the
|
||||
linear_kernel which is simply a normal dot product. However, more interesting
|
||||
kernels first apply some nonlinear transformation to the user's data samples and
|
||||
then compute a dot product. In this way, a simple algorithm that finds a linear
|
||||
plane to separate data (i.e. the SVM algorithm) can be made to solve complex
|
||||
nonlinear learning problems.
|
||||
|
||||
An important element of the kernel trick is that these kernel functions perform
|
||||
the nonlinear transformation implicitly. That is, if you look at the implementations
|
||||
of these kernel functions you won't see code that transforms two input vectors in
|
||||
some way and then computes their dot products. Instead you will see a simple function
|
||||
that takes two input vectors and just computes a single real number via some simple
|
||||
process. You can basically think of this as an optimization. Imagine that originally
|
||||
we wrote out the entire procedure to perform the nonlinear transformation and then
|
||||
compute the dot product but then noticed we could cancel a few terms here and there
|
||||
and simplify the whole thing down into a more compact and easily evaluated form.
|
||||
The result is a nice function that computes what we want but we no longer get to see
|
||||
what those nonlinearly transformed input vectors are.
|
||||
|
||||
The empirical_kernel_map is a tool that undoes this. It allows you to obtain these
|
||||
nonlinearly transformed vectors. It does this by taking a set of data samples from
|
||||
the user (referred to as basis samples), applying the nonlinear transformation to all
|
||||
of them, and then constructing a set of orthonormal basis vectors which spans the space
|
||||
occupied by those transformed input samples. Then if we wish to obtain the nonlinear
|
||||
version of any data sample we can simply project it onto this orthonormal basis and
|
||||
we obtain a regular vector of real numbers which represents the nonlinearly transformed
|
||||
version of the data sample. The empirical_kernel_map has been formulated to use only
|
||||
dot products between data samples so it is capable of performing this service for any
|
||||
user supplied kernel function.
|
||||
|
||||
The empirical_kernel_map is useful because it is often difficult to formulate an
|
||||
algorithm in a way that uses only dot products. So the empirical_kernel_map lets
|
||||
non-experts effectively kernelize any algorithm they like by using this object
|
||||
during a preprocessing step. However, it should be noted that the algorithm
|
||||
is only practical when used with at most a few thousand basis samples. Fortunately,
|
||||
most datasets live in subspaces that are relatively low dimensional. So for these
|
||||
datasets, using the empirical_kernel_map is practical assuming a reasonable set of
|
||||
basis samples can be selected by the user. To help with this dlib supplies the
|
||||
linearly_independent_subset_finder. Some people also find that just picking a random
|
||||
subset of their data and using that as a basis set is fine as well.
|
||||
|
||||
|
||||
|
||||
In what follows, we walk through the process of creating an empirical_kernel_map,
|
||||
projecting data to obtain the nonlinearly transformed vectors, and then doing a
|
||||
few interesting things with the data.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
#include "dlib/svm.h"
|
||||
#include "dlib/rand.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
|
||||
using namespace std;
|
||||
using namespace dlib;
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
// First lets make a typedef for the kind of samples we will be using.
|
||||
typedef matrix<double, 0, 1> sample_type;
|
||||
|
||||
// We will be using the radial_basis_kernel in this example program.
|
||||
typedef radial_basis_kernel<sample_type> kernel_type;
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void generate_concentric_circles (
|
||||
std::vector<sample_type>& samples,
|
||||
std::vector<double>& labels,
|
||||
const int num_points
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- generates two circles centered at the point (0,0), one of radius 1 and
|
||||
the other of radius 5. These points are stored into samples. labels will
|
||||
tell you if a given samples is from the smaller circle (its label will be 1)
|
||||
or from the larger circle (its label will be 2).
|
||||
- each circle will be made up of num_points
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void test_empirical_kernel_map (
|
||||
const std::vector<sample_type>& samples,
|
||||
const std::vector<double>& labels,
|
||||
const empirical_kernel_map<kernel_type>& ekm
|
||||
);
|
||||
/*!
|
||||
This function computes various interesting things with the empirical_kernel_map.
|
||||
See its implementation below for details.
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
int main()
|
||||
{
|
||||
std::vector<sample_type> samples;
|
||||
std::vector<double> labels;
|
||||
|
||||
// Declare an instance of the kernel we will be using.
|
||||
const kernel_type kern(0.1);
|
||||
|
||||
// create a dataset with two concentric circles. There will be 100 points on each circle.
|
||||
generate_concentric_circles(samples, labels, 100);
|
||||
|
||||
empirical_kernel_map<kernel_type> ekm;
|
||||
|
||||
|
||||
// Here we create an empirical_kernel_map using all of our data samples as basis samples.
|
||||
cout << "\n\nBuilding an empirical_kernel_map " << samples.size() << " basis samples." << endl;
|
||||
ekm.load(kern, samples);
|
||||
cout << "Test the empirical_kernel_map when it is loaded with every sample." << endl;
|
||||
test_empirical_kernel_map(samples, labels, ekm);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// create a new dataset with two concentric circles. There will be 1000 points on each circle.
|
||||
generate_concentric_circles(samples, labels, 1000);
|
||||
// Rather than using all 2000 samples as basis samples we are going to use the
|
||||
// linearly_independent_subset_finder to pick out 20 good basis samples. The idea behind this
|
||||
// object is to try and find the 20 samples that best span the subspace which contains all the
|
||||
// data.
|
||||
linearly_independent_subset_finder<kernel_type> lisf(kern, 20);
|
||||
// We have to let the linearly_independent_subset_finder look at all the data first.
|
||||
for (unsigned long i = 0; i < samples.size(); ++i)
|
||||
lisf.add(samples[i]);
|
||||
|
||||
// Now reload the empirical_kernel_map but this time using only our 20 basis samples
|
||||
// selected using the linearly_independent_subset_finder.
|
||||
cout << "\n\nBuilding an empirical_kernel_map with " << lisf.dictionary_size() << " basis samples." << endl;
|
||||
ekm.load(kern, lisf.get_dictionary());
|
||||
cout << "Test the empirical_kernel_map when it is loaded with samples from the lisf object." << endl;
|
||||
test_empirical_kernel_map(samples, labels, ekm);
|
||||
|
||||
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void test_empirical_kernel_map (
|
||||
const std::vector<sample_type>& samples,
|
||||
const std::vector<double>& labels,
|
||||
const empirical_kernel_map<kernel_type>& ekm
|
||||
)
|
||||
{
|
||||
|
||||
std::vector<sample_type> projected_samples;
|
||||
|
||||
// The first thing we do is compute the nonlinearly projected vectors using the
|
||||
// empirical_kernel_map.
|
||||
for (unsigned long i = 0; i < samples.size(); ++i)
|
||||
{
|
||||
projected_samples.push_back(ekm.project(samples[i]));
|
||||
}
|
||||
|
||||
// Note that a kernel matrix is just a matrix M such that M(i,j) == kernel(samples[i],samples[j]).
|
||||
// So below we are computing the normal kernel matrix as given by the radial_basis_kernel and the
|
||||
// input samples. We also compute the kernel matrix for all the projected_samples as given by the
|
||||
// linear_kernel. Note that the linear_kernel just computes normal dot products. So what we want to
|
||||
// see is that the dot products between all the projected_samples samples are the same as the outputs
|
||||
// of the kernel function for their respective untransformed input samples. If they match then
|
||||
// we know that the empirical_kernel_map is working properly.
|
||||
const matrix<double> normal_kernel_matrix = kernel_matrix(ekm.get_kernel(), samples);
|
||||
const matrix<double> new_kernel_matrix = kernel_matrix(linear_kernel<sample_type>(), projected_samples);
|
||||
|
||||
cout << "Max kernel matrix error: " << max(abs(normal_kernel_matrix - new_kernel_matrix)) << endl;
|
||||
cout << "mean kernel matrix error: " << mean(abs(normal_kernel_matrix - new_kernel_matrix)) << endl;
|
||||
/*
|
||||
Example outputs from these cout statements.
|
||||
For the case where we use all samples as basis samples:
|
||||
Max kernel matrix error: 2.73115e-14
|
||||
mean kernel matrix error: 6.19125e-15
|
||||
|
||||
For the case where we use only 20 samples as basis samples:
|
||||
Max kernel matrix error: 0.0154466
|
||||
mean kernel matrix error: 0.000753427
|
||||
|
||||
|
||||
Note that if we use enough basis samples to perfectly span the space of input samples
|
||||
then we get errors that are essentially just rounding noise (Moreover, using all the
|
||||
samples is always enough since they are always within their own span). Once we start
|
||||
to use fewer basis samples we begin to get approximation error since the data doesn't
|
||||
really lay exactly in a 20 dimensional subspace. But it is pretty close.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
// Now lets do something more interesting. The below loop finds the centroids
|
||||
// of the two classes of data.
|
||||
sample_type class1_center(ekm.out_vector_size());
|
||||
sample_type class2_center(ekm.out_vector_size());
|
||||
class1_center = 0;
|
||||
class2_center = 0;
|
||||
for (unsigned long i = 0; i < projected_samples.size(); ++i)
|
||||
{
|
||||
if (labels[i] == 1)
|
||||
class1_center += projected_samples[i];
|
||||
else
|
||||
class2_center += projected_samples[i];
|
||||
}
|
||||
|
||||
const int points_per_class = samples.size()/2;
|
||||
class1_center /= points_per_class;
|
||||
class2_center /= points_per_class;
|
||||
|
||||
|
||||
// Now classify points by which center they are nearest. Recall that the data
|
||||
// is made up of two concentric circles. Normally you can't separate two concentric
|
||||
// circles by checking which points are nearest to each center since they have the same
|
||||
// centers. All the points would just associate to the smallest circle. However,
|
||||
// the kernel trick makes the data separable and the loop below will perfectly
|
||||
// classify each data point.
|
||||
for (unsigned long i = 0; i < projected_samples.size(); ++i)
|
||||
{
|
||||
double distance_to_class1 = length(projected_samples[i] - class1_center);
|
||||
double distance_to_class2 = length(projected_samples[i] - class2_center);
|
||||
|
||||
bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);
|
||||
|
||||
// Now print a message for any misclassified points.
|
||||
if (predicted_as_class_1 == true && labels[i] != 1)
|
||||
cout << "A point was misclassified" << endl;
|
||||
|
||||
if (predicted_as_class_1 == false && labels[i] != 2)
|
||||
cout << "A point was misclassified" << endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Next, note that classifying a point based on its distance between two other
|
||||
// points is the same thing as using the plane that lies between those two points
|
||||
// as a decision boundary. So lets compute that decision plane and use it to classify
|
||||
// all the points.
|
||||
|
||||
sample_type plane_normal_vector = class1_center - class2_center;
|
||||
// The point right in the center of our two classes should be on the deciding plane, not
|
||||
// on one side or the other. This consideration brings us to the formula for the bias.
|
||||
double bias = dot((class1_center+class2_center)/2, plane_normal_vector);
|
||||
|
||||
// Now classify points by which side of the plane they are on.
|
||||
for (unsigned long i = 0; i < projected_samples.size(); ++i)
|
||||
{
|
||||
double side = dot(plane_normal_vector, projected_samples[i]) - bias;
|
||||
|
||||
bool predicted_as_class_1 = (side > 0);
|
||||
|
||||
// Now print a message for any misclassified points.
|
||||
if (predicted_as_class_1 == true && labels[i] != 1)
|
||||
cout << "A point was misclassified" << endl;
|
||||
|
||||
if (predicted_as_class_1 == false && labels[i] != 2)
|
||||
cout << "A point was misclassified" << endl;
|
||||
}
|
||||
|
||||
|
||||
// It would be nice to convert this decision rule into a normal decision_function object and
|
||||
// dispense with the empirical_kernel_map. Happily, it is possible to do so. Consider the
|
||||
// following example code:
|
||||
decision_function<kernel_type> dec_funct = ekm.convert_to_decision_function(plane_normal_vector);
|
||||
// The dec_funct now computes dot products between plane_normal_vector and the projection
|
||||
// of any sample point given to it. All that remains is to account for the bias.
|
||||
dec_funct.b = bias;
|
||||
|
||||
// now classify points by which side of the plane they are on.
|
||||
for (unsigned long i = 0; i < samples.size(); ++i)
|
||||
{
|
||||
double side = dec_funct(samples[i]);
|
||||
|
||||
// And lets just check that the dec_funct really does compute the same thing as the previous equation.
|
||||
double side_alternate_equation = dot(plane_normal_vector, projected_samples[i]) - bias;
|
||||
if (abs(side-side_alternate_equation) > 1e-14)
|
||||
cout << "dec_funct error: " << abs(side-side_alternate_equation) << endl;
|
||||
|
||||
bool predicted_as_class_1 = (side > 0);
|
||||
|
||||
// Now print a message for any misclassified points.
|
||||
if (predicted_as_class_1 == true && labels[i] != 1)
|
||||
cout << "A point was misclassified" << endl;
|
||||
|
||||
if (predicted_as_class_1 == false && labels[i] != 2)
|
||||
cout << "A point was misclassified" << endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void generate_concentric_circles (
|
||||
std::vector<sample_type>& samples,
|
||||
std::vector<double>& labels,
|
||||
const int num
|
||||
)
|
||||
{
|
||||
sample_type m(2,1);
|
||||
samples.clear();
|
||||
labels.clear();
|
||||
|
||||
dlib::rand::float_1a rnd;
|
||||
|
||||
// make some samples near the origin
|
||||
double radius = 1.0;
|
||||
for (long i = 0; i < num; ++i)
|
||||
{
|
||||
double sign = 1;
|
||||
if (rnd.get_random_double() < 0.5)
|
||||
sign = -1;
|
||||
m(0) = 2*radius*rnd.get_random_double()-radius;
|
||||
m(1) = sign*sqrt(radius*radius - m(0)*m(0));
|
||||
|
||||
samples.push_back(m);
|
||||
labels.push_back(1);
|
||||
}
|
||||
|
||||
// make some samples in a circle around the origin but far away
|
||||
radius = 5.0;
|
||||
for (long i = 0; i < num; ++i)
|
||||
{
|
||||
double sign = 1;
|
||||
if (rnd.get_random_double() < 0.5)
|
||||
sign = -1;
|
||||
m(0) = 2*radius*rnd.get_random_double()-radius;
|
||||
m(1) = sign*sqrt(radius*radius - m(0)*m(0));
|
||||
|
||||
samples.push_back(m);
|
||||
labels.push_back(2);
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
Loading…
Reference in New Issue