mirror of https://github.com/davisking/dlib.git
Added rank_unlabeled_training_samples()
This commit is contained in:
parent
015643e078
commit
b5e8d9d835
|
@ -47,6 +47,7 @@
|
||||||
#include "svm/svm_multiclass_linear_trainer.h"
|
#include "svm/svm_multiclass_linear_trainer.h"
|
||||||
#include "svm/sequence_labeler.h"
|
#include "svm/sequence_labeler.h"
|
||||||
#include "svm/assignment_function.h"
|
#include "svm/assignment_function.h"
|
||||||
|
#include "svm/active_learning.h"
|
||||||
|
|
||||||
#endif // DLIB_SVm_HEADER
|
#endif // DLIB_SVm_HEADER
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,162 @@
|
||||||
|
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
|
||||||
|
// License: Boost Software License See LICENSE.txt for the full license.
|
||||||
|
#ifndef DLIB_ACTIVE_LEARnING_H__
|
||||||
|
#define DLIB_ACTIVE_LEARnING_H__
|
||||||
|
|
||||||
|
#include "active_learning_abstract.h"
|
||||||
|
|
||||||
|
#include "svm_c_linear_dcd_trainer.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace dlib
|
||||||
|
{
|
||||||
|
|
||||||
|
    // Selection strategies for ranking unlabeled samples, from Tong and Koller's
    // "Support Vector Machine Active Learning with Applications to Text
    // Classification".
    enum active_learning_mode
    {
        max_min_margin, // score a sample by its smallest margin over the two possible labelings
        ratio_margin    // score a sample by how close its two per-label margins are to each other
    };
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
    template <
        typename kernel_type,
        typename in_sample_vector_type,
        typename in_scalar_vector_type,
        typename in_sample_vector_type2
        >
    std::vector<unsigned long> impl_rank_unlabeled_training_samples (
        const svm_c_linear_dcd_trainer<kernel_type>& trainer,
        const in_sample_vector_type& samples,
        const in_scalar_vector_type& labels,
        const in_sample_vector_type2& unlabeled_samples,
        const active_learning_mode mode
    )
    /*!
        Backend for rank_unlabeled_training_samples().  The public front end
        converts its arguments into matrix expressions via mat() before calling
        this routine.  For each unlabeled sample we retrain the SVM twice (once
        per possible label), measure the resulting margins, and score the sample
        according to the selected active learning strategy.  Returns the indices
        of unlabeled_samples ordered so the highest scoring samples come first.

        NOTE(review): uses std::sort, std::min, std::abs, and std::pair but this
        header only includes <vector> directly — it appears to rely on transitive
        includes from the dlib headers.  Consider including <algorithm>, <cmath>,
        and <utility> explicitly.
    !*/
    {
        DLIB_ASSERT(is_vector(unlabeled_samples) &&
                    (samples.size() == 0 || is_learning_problem(samples, labels)) ,
                    "\t std::vector<unsigned long> rank_unlabeled_training_samples()"
                    << "\n\t Invalid inputs were given to this function"
                    << "\n\t is_vector(unlabeled_samples): " << is_vector(unlabeled_samples)
                    << "\n\t is_learning_problem(samples, labels): " << is_learning_problem(samples, labels)
                    << "\n\t samples.size(): " << samples.size()
                    << "\n\t labels.size(): " << labels.size()
                    );

        // If there aren't any training samples then all unlabeled_samples are equally good.
        // So just report an arbitrary ordering.
        if (samples.size() == 0 || unlabeled_samples.size() == 0)
        {
            std::vector<unsigned long> ret(unlabeled_samples.size());
            for (unsigned long i = 0; i < ret.size(); ++i)
                ret[i] = i;

            return ret;
        }

        // We are going to score each unlabeled sample and put the score and index into
        // results.  Then at the end of this function we just sort it and return the indices.
        std::vector<std::pair<double, unsigned long> > results;
        results.resize(unlabeled_samples.size());

        // make sure we use this trainer's ability to warm start itself since that will make
        // this whole function run a lot faster.  But first, we need to find out what the state
        // we will be warm starting from is.
        typedef typename svm_c_linear_dcd_trainer<kernel_type>::optimizer_state optimizer_state;
        optimizer_state state;
        trainer.train(samples, labels, state); // call train() just to get state

        decision_function<kernel_type> df;

        // Working copy of the training set with one extra slot at the end.  Each
        // candidate unlabeled sample is placed into that slot (with each of the two
        // possible labels in turn) so we can retrain and measure its margin.
        std::vector<typename kernel_type::sample_type> temp_samples;
        std::vector<typename kernel_type::scalar_type> temp_labels;
        temp_samples.reserve(samples.size()+1);
        temp_labels.reserve(labels.size()+1);
        temp_samples.assign(samples.begin(), samples.end());
        temp_labels.assign(labels.begin(), labels.end());
        temp_samples.resize(temp_samples.size()+1);
        temp_labels.resize(temp_labels.size()+1);


        for (unsigned long i = 0; i < unlabeled_samples.size(); ++i)
        {
            temp_samples.back() = unlabeled_samples(i);
            // figure out the margin for each possible labeling of this sample.

            // Warm start from the solution for the original training set.  We work on
            // a copy so `state` stays pristine for the next candidate sample.
            optimizer_state temp(state);
            temp_labels.back() = +1;
            df = trainer.train(temp_samples, temp_labels, temp);
            const double margin_p = temp_labels.back()*df(temp_samples.back());

            temp = state;
            temp_labels.back() = -1;
            df = trainer.train(temp_samples, temp_labels, temp);
            const double margin_n = temp_labels.back()*df(temp_samples.back());

            if (mode == max_min_margin)
            {
                // The score for this sample is its min possible margin over possible labels.
                // Therefore, this score measures how much flexibility we have to label this
                // sample however we want.  The intuition being that the most useful points to
                // label are the ones that are still free to obtain either label.
                results[i] = std::make_pair(std::min(margin_p, margin_n), i);
            }
            else
            {
                // In this case, the score for the sample is a ratio that tells how close the
                // two margin values are to each other.  The closer they are the better.  So in
                // this case we are saying we are looking for samples that have the same
                // preference for either class label.
                if (std::abs(margin_p) >= std::abs(margin_n))
                {
                    if (margin_p != 0)
                        results[i] = std::make_pair(margin_n/margin_p, i);
                    else // if both are == 0 then say 0/0 == 1
                        results[i] = std::make_pair(1, i);
                }
                else
                {
                    // margin_n can't be 0 here since |margin_n| > |margin_p| >= 0.
                    results[i] = std::make_pair(margin_p/margin_n, i);
                }
            }
        }

        // sort the results so the highest scoring samples come first.
        std::sort(results.rbegin(), results.rend());

        // transfer results into a vector with just sample indices so we can return it.
        std::vector<unsigned long> ret(results.size());
        for (unsigned long i = 0; i < ret.size(); ++i)
            ret[i] = results[i].second;
        return ret;
    }
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <
|
||||||
|
typename kernel_type,
|
||||||
|
typename in_sample_vector_type,
|
||||||
|
typename in_scalar_vector_type,
|
||||||
|
typename in_sample_vector_type2
|
||||||
|
>
|
||||||
|
std::vector<unsigned long> rank_unlabeled_training_samples (
|
||||||
|
const svm_c_linear_dcd_trainer<kernel_type>& trainer,
|
||||||
|
const in_sample_vector_type& samples,
|
||||||
|
const in_scalar_vector_type& labels,
|
||||||
|
const in_sample_vector_type2& unlabeled_samples,
|
||||||
|
const active_learning_mode mode = max_min_margin
|
||||||
|
)
|
||||||
|
{
|
||||||
|
return impl_rank_unlabeled_training_samples(trainer,
|
||||||
|
mat(samples),
|
||||||
|
mat(labels),
|
||||||
|
mat(unlabeled_samples),
|
||||||
|
mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // DLIB_ACTIVE_LEARnING_H__
|
||||||
|
|
|
@ -0,0 +1,75 @@
|
||||||
|
// Copyright (C) 2012 Davis E. King (davis@dlib.net)
|
||||||
|
// License: Boost Software License See LICENSE.txt for the full license.
|
||||||
|
#undef DLIB_ACTIVE_LEARnING_ABSTRACT_H__
|
||||||
|
#ifdef DLIB_ACTIVE_LEARnING_ABSTRACT_H__
|
||||||
|
|
||||||
|
#include "svm_c_linear_dcd_trainer_abstract.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace dlib
|
||||||
|
{
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
enum active_learning_mode
|
||||||
|
{
|
||||||
|
max_min_margin,
|
||||||
|
ratio_margin
|
||||||
|
};
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <
|
||||||
|
typename kernel_type,
|
||||||
|
typename in_sample_vector_type,
|
||||||
|
typename in_scalar_vector_type,
|
||||||
|
typename in_sample_vector_type2
|
||||||
|
>
|
||||||
|
std::vector<unsigned long> rank_unlabeled_training_samples (
|
||||||
|
const svm_c_linear_dcd_trainer<kernel_type>& trainer,
|
||||||
|
const in_sample_vector_type& samples,
|
||||||
|
const in_scalar_vector_type& labels,
|
||||||
|
const in_sample_vector_type2& unlabeled_samples,
|
||||||
|
const active_learning_mode mode = max_min_margin
|
||||||
|
);
|
||||||
|
/*!
|
||||||
|
requires
|
||||||
|
- if (samples.size() != 0) then
|
||||||
|
- it must be legal to call trainer.train(samples, labels)
|
||||||
|
- is_learning_problem(samples, labels) == true
|
||||||
|
- unlabeled_samples must contain the same kind of vectors as samples.
|
||||||
|
- unlabeled_samples, samples, and labels must be matrices or types of
|
||||||
|
objects convertible to a matrix via mat().
|
||||||
|
- is_vector(unlabeled_samples) == true
|
||||||
|
ensures
|
||||||
|
- Suppose that we wish to learn a binary classifier by calling
|
||||||
|
trainer.train(samples, labels) but we are also interested in selecting one of
|
||||||
|
the elements of unlabeled_samples to add to our training data. Since doing
|
||||||
|
this requires us to find out the label of the sample, a potentially tedious
|
||||||
|
or expensive process, we would like to select the "best" element from
|
||||||
|
unlabeled_samples for labeling. The rank_unlabeled_training_samples()
|
||||||
|
function attempts to find this "best" element. In particular, this function returns a
|
||||||
|
ranked list of all the elements in unlabeled_samples such that the
|
||||||
|
"best" elements come first.
|
||||||
|
- The method used by this function is described in the paper:
|
||||||
|
Support Vector Machine Active Learning with Applications to Text Classification
|
||||||
|
by Simon Tong and Daphne Koller
|
||||||
|
In particular, this function implements the MaxMin Margin and Ratio Margin
|
||||||
|
selection strategies described in the paper. Moreover, the mode argument
|
||||||
|
to this function selects which of these strategies is used.
|
||||||
|
- returns a std::vector V such that:
|
||||||
|
- V contains a list of all the indices from unlabeled_samples. Moreover,
|
||||||
|
they are ordered so that the most useful samples come first.
|
||||||
|
- V.size() == unlabeled_samples.size()
|
||||||
|
- unlabeled_samples[V[0]] == The best sample to add into the training set.
|
||||||
|
- unlabeled_samples[V[1]] == The second best sample to add into the training set.
|
||||||
|
- unlabeled_samples[V[i]] == The i-th best sample to add into the training set.
|
||||||
|
!*/
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // DLIB_ACTIVE_LEARnING_ABSTRACT_H__
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue