From 3330d34fd16cd726c2f2b04f5d8a4d839721121f Mon Sep 17 00:00:00 2001 From: Davis King Date: Mon, 3 Jun 2013 21:26:56 -0400 Subject: [PATCH] Added an example for the sequence_segmenter --- examples/CMakeLists.txt | 1 + examples/sequence_segmenter_ex.cpp | 242 +++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 examples/sequence_segmenter_ex.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 27058201d..13a57f6d7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -69,6 +69,7 @@ add_example(running_stats_ex) add_example(rvm_ex) add_example(rvm_regression_ex) add_example(sequence_labeler_ex) +add_example(sequence_segmenter_ex) add_example(server_http_ex) add_example(server_iostream_ex) add_example(sockets_ex) diff --git a/examples/sequence_segmenter_ex.cpp b/examples/sequence_segmenter_ex.cpp new file mode 100644 index 000000000..2b236c021 --- /dev/null +++ b/examples/sequence_segmenter_ex.cpp @@ -0,0 +1,242 @@ +// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt +/* + + This example shows how to use dlib to learn to perform sequence segmentation. In a + sequence segmentation task we are given a sequence of objects (e.g. words in a + sentence) and we are supposed to detect certain subsequences (e.g. the names of + people). In the code below we create some very simple sequence/segmentation training + examples and use them to learn a sequence segmentation model. In particular, our + sequences will be arrays of strings and our task will be to learn to identify person + names. Once we have our segmentation model we can use it to find names in new + sentences as we will show. +*/ + + +#include +#include +#include +#include + +using namespace std; +using namespace dlib; + + +// ---------------------------------------------------------------------------------------- + +class feature_extractor +{ + /* + The sequence segmentation models we work with in this example are chain structured + conditional random field style models. Therefore, central to a sequence + segmentation model is a feature extractor object. This object defines all the + properties of the model such as how many features it will use, and more importantly, + how they are calculated. + */ + +public: + // This should be the type used to represent an input sequence. It can be + // anything so long as it has a .size() which returns the length of the sequence. + typedef std::vector sequence_type; + + // The next four lines define high-level properties of the feature extraction model. + // See the documentation for the sequence_labeler object for an extended discussion of + // how they are used (note that the main body of the documentation is at the top of the + // file documenting the sequence_labeler). + const static bool use_BIO_model = true; + const static bool use_high_order_features = true; + const static bool allow_negative_weights = true; + unsigned long window_size() const { return 3; } + + // This function defines the dimensionality of the vectors output by the get_features() + // function defined below. + unsigned long num_features() const { return 1; } + + template + void get_features ( + feature_setter& set_feature, + const sequence_type& sentence, + unsigned long position + ) const + /*! + requires + - position < sentence.size() + - set_feature is a function object which allows expressions of the form: + - set_features((unsigned long)feature_index, (double)feature_value); + - set_features((unsigned long)feature_index); + ensures + - This function computes a feature vector which should capture the properties + of sentence[position] that are informative relative to the sequence + segmentation task you are trying to perform. + - The output feature vector is returned as a sparse vector by invoking set_feature(). + For example, to set the feature with an index of 55 to the value of 1 + this method would call: + set_feature(55); + Or equivalently: + set_feature(55,1); + Therefore, the first argument to set_feature is the index of the feature + to be set while the second argument is the value the feature should take. + Additionally, note that calling set_feature() multiple times with the + same feature index does NOT overwrite the old value, it adds to the + previous value. For example, if you call set_feature(55) 3 times then it + will result in feature 55 having a value of 3. + - This function only calls set_feature() with feature_index values < num_features() + !*/ + { + // The model in this example program is very simple. Our features only look at the + // capitalization pattern of the words. So we have a single feature which checks + // if the first letter is capitalized or not. + + // if the first character of the word sentence[position] is capitalized then + // we set our only feature 1, otherwise it remains set to 0. + if (isupper(sentence[position][0])) + set_feature(0); + } +}; + +// We need to define serialize() and deserialize() for our feature extractor if we want +// to be able to serialize and deserialize our learned models. In this case the +// implementation is empty since our feature_extractor doesn't have any state. But you +// might define more complex feature extractors which have state that needs to be saved. +void serialize(const feature_extractor&, std::ostream&) {} +void deserialize(feature_extractor&, std::istream&) {} + +// ---------------------------------------------------------------------------------------- + +void make_training_examples ( + std::vector >& samples, + std::vector > >& segments +) +/*! + ensures + - This function fills samples with example sentences and segments with the + locations of person names that should be segmented out. + - #samples.size() == #segments.size() +!*/ +{ + std::vector > names; + + + // Here we make our first training example. split() turns the string into an array of + // 10 words and then we store that into samples. + samples.push_back(split("The other day I saw a man named Jim Smith")); + // We want to detect person names. So we note that the name is located within the + // range [8, 10). Note that we use half open ranges to identify segments. So in this + // case, the segment identifies the string "Jim Smith". + names.push_back(make_pair(8, 10)); + segments.push_back(names); names.clear(); + + // Now we add a few more example sentences + + samples.push_back(split("Davis King is the main author of the dlib Library")); + names.push_back(make_pair(0, 2)); + segments.push_back(names); names.clear(); + + + samples.push_back(split("Bob Jones is a name and so is George Clinton")); + names.push_back(make_pair(0, 2)); + names.push_back(make_pair(8, 10)); + segments.push_back(names); names.clear(); + + + samples.push_back(split("My dog is named Bob Barker")); + names.push_back(make_pair(4, 6)); + segments.push_back(names); names.clear(); + + + samples.push_back(split("ABC is an acronym but John James Smith is a name")); + names.push_back(make_pair(5, 8)); + segments.push_back(names); names.clear(); + + + samples.push_back(split("No names in this sentence at all")); + segments.push_back(names); names.clear(); +} + +// ---------------------------------------------------------------------------------------- + +void print_segment ( + const std::vector& sentence, + const std::pair& segment +) +{ + // Recall that a segment is a half open range starting with .first and ending just + // before .second. + for (unsigned long i = segment.first; i < segment.second; ++i) + cout << sentence[i] << " "; + cout << endl; +} + +// ---------------------------------------------------------------------------------------- + +int main() +{ + // Finally we make it into the main program body. So the first thing we do is get our + // training data. + std::vector > samples; + std::vector > > segments; + make_training_examples(samples, segments); + + + // Next we use the structural_sequence_segmentation_trainer to learn our segmentation + // model based on just the samples and segments. But first we setup some of its + // parameters. + structural_sequence_segmentation_trainer trainer; + // This is the common SVM C parameter. Larger values encourage the trainer to attempt + // to fit the data exactly but might overfit. In general, you determine this parameter + // by cross-validation. + trainer.set_c(10); + // This trainer can use multiple CPU cores to speed up the training. So set this to + // the number of available CPU cores. + trainer.set_num_threads(4); + + + // Learn to do sequence segmentation from the dataset + sequence_segmenter segmenter = trainer.train(samples, segments); + + + // Lets print out all the segments our segmenter detects. + for (unsigned long i = 0; i < samples.size(); ++i) + { + // get all the detected segments in samples[i] + std::vector > seg = segmenter(samples[i]); + // Print each of them + for (unsigned long j = 0; j < seg.size(); ++j) + { + print_segment(samples[i], seg[j]); + } + } + + + std::vector sentence(split("There once was a man from Nantucket whose name rhymed with Bob Bucket")); + std::vector > seg = segmenter(sentence); + for (unsigned long j = 0; j < seg.size(); ++j) + { + print_segment(sentence, seg[j]); + } + + + + // We can also test the accuracy of the segmenter on a dataset. This statement simply + // tests on the training data. In this case we will see that it predicts everything + // correctly. + cout << "\nprecision, recall, f1-score: " << test_sequence_segmenter(segmenter, samples, segments); + // Similarly, we can do 5-fold cross-validation and print the results. Just as before, + // we see everything is predicted correctly. + cout << "precision, recall, f1-score: " << cross_validate_sequence_segmenter(trainer, samples, segments, 5); + + + + + + // Finally, the segmenter can be serialized to disk just like most dlib objects. + ofstream fout("segmenter.dat", ios::binary); + serialize(segmenter, fout); + fout.close(); + + // recall from disk + ifstream fin("segmenter.dat", ios::binary); + deserialize(segmenter, fin); +} + +// ---------------------------------------------------------------------------------------- +