#!/usr/bin/python
# The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
#
# This example shows how to use dlib to learn to do sequence segmentation. In
# a sequence segmentation task we are given a sequence of objects (e.g. words in
# a sentence) and we are supposed to detect certain subsequences (e.g. the names
# of people). Therefore, in the code below we create some very simple training
# sequences and use them to learn a sequence segmentation model. In particular,
# our sequences will be sentences represented as arrays of words and our task
# will be to learn to identify person names. Once we have our segmentation
# model we can use it to find names in new sentences, as we will show.
#
# COMPILING/INSTALLING THE DLIB PYTHON INTERFACE
# You can install dlib using the command:
# pip install dlib
#
# Alternatively, if you want to compile dlib yourself then go into the dlib
# root folder and run:
# python setup.py install
# or
# python setup.py install --yes USE_AVX_INSTRUCTIONS
# if you have a CPU that supports AVX instructions, since this makes some
# things run faster.
#
# Compiling dlib should work on any operating system so long as you have
# CMake and boost-python installed. On Ubuntu, this can be done easily by
# running the command:
# sudo apt-get install libboost-python-dev cmake
#
import sys
import dlib
# The sequence segmentation models we work with in this example are chain
# structured conditional random field style models. Therefore, central to a
# sequence segmentation model is some method for converting the elements of a
# sequence into feature vectors. That is, while you might start out representing
# your sequence as an array of strings, the dlib interface works in terms of
# arrays of feature vectors. Each feature vector should capture important
# information about its corresponding element in the original raw sequence. So
# in this example, since we work with sequences of words and want to identify
# names, we will create feature vectors that tell us if the word is capitalized
# or not. In our simple data, this will be enough to identify names.
# Therefore, we define sentence_to_vectors() which takes a sentence represented
# as a string and converts it into an array of words and then associates a
# feature vector with each word.
def sentence_to_vectors(sentence):
    # Create an empty array of vectors
    vects = dlib.vectors()
    for word in sentence.split():
        # Our vectors are very simple 1-dimensional vectors. The value of the
        # single feature is 1 if the first letter of the word is capitalized and
        # 0 otherwise.
        if word[0].isupper():
            vects.append(dlib.vector([1]))
        else:
            vects.append(dlib.vector([0]))
    return vects
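

# For example, a quick sanity check of this representation: each word should
# map to exactly one feature vector (this relies on len() being available on
# dlib.vectors, which dlib's Python container types provide).
example_vects = sentence_to_vectors("Bob Barker is here")
assert len(example_vects) == 4
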
# Dlib also supports the use of a sparse vector representation. This is more
# efficient than the above form when you have very high dimensional vectors that
# are mostly full of zeros. In dlib, each sparse vector is represented as an
# array of pair objects. Each pair contains an index and value. Any index not
# listed in the vector is implicitly associated with a value of zero.
# Additionally, when using sparse vectors with dlib.train_sequence_segmenter()
# you can use "unsorted" sparse vectors. This means you can add the index/value
# pairs into your sparse vectors in any order you want and don't need to worry
# about them being in sorted order.
def sentence_to_sparse_vectors(sentence):
    vects = dlib.sparse_vectors()
    has_cap = dlib.sparse_vector()
    no_cap = dlib.sparse_vector()

    # make has_cap equivalent to dlib.vector([1])
    has_cap.append(dlib.pair(0, 1))

    # Since we didn't add anything to no_cap it is equivalent to
    # dlib.vector([0])

    for word in sentence.split():
        if word[0].isupper():
            vects.append(has_cap)
        else:
            vects.append(no_cap)
    return vects
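

# To illustrate the "unsorted" point made above: index/value pairs can be
# appended to a sparse vector in any order. The little example below lists
# index 5 before index 2, and that is perfectly acceptable to
# dlib.train_sequence_segmenter().
unsorted_example = dlib.sparse_vector()
unsorted_example.append(dlib.pair(5, 1))
unsorted_example.append(dlib.pair(2, 1))

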
def print_segment(sentence, names):
    words = sentence.split()
    for name in names:
        for i in name:
            sys.stdout.write(words[i] + " ")
        sys.stdout.write("\n")
# Now let's make some training data. Each example is a sentence as well as a
# set of ranges which indicate the locations of any names.
names = dlib.ranges() # make an array of dlib.range objects.
segments = dlib.rangess() # make an array of arrays of dlib.range objects.
sentences = []
sentences.append("The other day I saw a man named Jim Smith")
# We want to detect person names. So we note that the name is located within
# the range [8, 10). Note that we use half open ranges to identify segments.
# So in this case, the segment identifies the string "Jim Smith".
names.append(dlib.range(8, 10))
segments.append(names)
names.clear() # make names empty for use again below
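
# As a quick check of that convention: slicing the words of the first sentence
# with the same half open range recovers exactly the labeled name.
assert sentences[0].split()[8:10] == ["Jim", "Smith"]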
sentences.append("Davis King is the main author of the dlib Library")
names.append(dlib.range(0, 2))
segments.append(names)
names.clear()
sentences.append("Bob Jones is a name and so is George Clinton")
names.append(dlib.range(0, 2))
names.append(dlib.range(8, 10))
segments.append(names)
names.clear()
sentences.append("My dog is named Bob Barker")
names.append(dlib.range(4, 6))
segments.append(names)
names.clear()
sentences.append("ABC is an acronym but John James Smith is a name")
names.append(dlib.range(5, 8))
segments.append(names)
names.clear()
sentences.append("No names in this sentence at all")
segments.append(names)
names.clear()
# Now before we can pass these training sentences to the dlib tools we need to
# convert them into arrays of vectors as discussed above. We can use either a
# sparse or dense representation depending on our needs. In this example, we
# show how to do it both ways.
use_sparse_vects = False
if use_sparse_vects:
    # Make an array of arrays of dlib.sparse_vector objects.
    training_sequences = dlib.sparse_vectorss()
    for s in sentences:
        training_sequences.append(sentence_to_sparse_vectors(s))
else:
    # Make an array of arrays of dlib.vector objects.
    training_sequences = dlib.vectorss()
    for s in sentences:
        training_sequences.append(sentence_to_vectors(s))
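
# After the conversion there is one feature vector per word and one array of
# ranges per training sentence. A quick check of those invariants (len() is
# supported by dlib's Python container types):
assert len(training_sequences) == len(segments)
assert len(training_sequences[0]) == len(sentences[0].split())
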
# Now that we have a simple training set we can train a sequence segmenter.
# However, the sequence segmentation trainer has some optional parameters we can
# set. These parameters determine properties of the segmentation model we will
# learn. See the dlib documentation for the sequence_segmenter object for a
# full discussion of their meanings.
params = dlib.segmenter_params()
params.window_size = 3
params.use_high_order_features = True
params.use_BIO_model = True
# This is the common SVM C parameter. Larger values encourage the trainer to
# attempt to fit the data exactly but might overfit. In general, you determine
# this parameter by cross-validation.
params.C = 10
# Train a model. The model object is responsible for predicting the locations
# of names in new sentences.
model = dlib.train_sequence_segmenter(training_sequences, segments, params)
# Let's print out the things the model thinks are names. The output is a set
# of ranges which are predicted to contain names. If you run this example
# program you will see that it gets them all correct.
for i, s in enumerate(sentences):
    print_segment(s, model(training_sequences[i]))

# Let's also try segmenting a new sentence. This will print out "Bob Bucket".
# Note that we need to remember to use the same vector representation as we used
# during training.
test_sentence = "There once was a man from Nantucket " \
"whose name rhymed with Bob Bucket"
if use_sparse_vects:
    print_segment(test_sentence,
                  model(sentence_to_sparse_vectors(test_sentence)))
else:
    print_segment(test_sentence, model(sentence_to_vectors(test_sentence)))

# We can also measure the accuracy of a model relative to some labeled data.
# This statement prints the precision, recall, and F1-score of the model
# relative to the data in training_sequences/segments.
print("Test on training data: {}".format(
    dlib.test_sequence_segmenter(model, training_sequences, segments)))

# We can also do 5-fold cross-validation and print the resulting precision,
# recall, and F1-score.
print("Cross validation: {}".format(
    dlib.cross_validate_sequence_segmenter(training_sequences, segments, 5,
                                           params)))
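

# As noted above, the C parameter is normally chosen by cross-validation. A
# minimal sketch of such a search (the candidate values below are arbitrary):
# rerun the 5-fold cross-validation for a few settings of C and keep whichever
# gives the best precision/recall/F1 numbers.
for candidate_C in [0.1, 1, 10, 100]:
    params.C = candidate_C
    print("C = {}: {}".format(
        candidate_C,
        dlib.cross_validate_sequence_segmenter(training_sequences, segments, 5,
                                               params)))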