#!/usr/bin/python
# The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
#
# This example shows how to use dlib to learn to do sequence segmentation. In
# a sequence segmentation task we are given a sequence of objects (e.g. words in
# a sentence) and we are supposed to detect certain subsequences (e.g. the names
# of people). Therefore, in the code below we create some very simple training
# sequences and use them to learn a sequence segmentation model. In particular,
# our sequences will be sentences represented as arrays of words and our task
# will be to learn to identify person names. Once we have our segmentation
# model we can use it to find names in new sentences, as we will show.
#
# COMPILING/INSTALLING THE DLIB PYTHON INTERFACE
#   You can install dlib using the command:
#       pip install dlib
#
#   Alternatively, if you want to compile dlib yourself then go into the dlib
#   root folder and run:
#       python setup.py install
#   or
#       python setup.py install --yes USE_AVX_INSTRUCTIONS
#   if you have a CPU that supports AVX instructions, since this makes some
#   things run faster.
#
#   Compiling dlib should work on any operating system so long as you have
#   CMake and boost-python installed. On Ubuntu, this can be done easily by
#   running the command:
#       sudo apt-get install libboost-python-dev cmake
#
|
2013-06-07 08:27:29 +08:00
|
|
|
import sys
|
2014-12-11 17:44:50 +08:00
|
|
|
import dlib
|
|
|
|
|
2013-06-07 08:27:29 +08:00
|
|
|
|
2014-12-11 17:44:50 +08:00
|
|
|
# The sequence segmentation models we work with in this example are chain
# structured conditional random field style models. Therefore, central to a
# sequence segmentation model is some method for converting the elements of a
# sequence into feature vectors. That is, while you might start out representing
# your sequence as an array of strings, the dlib interface works in terms of
# arrays of feature vectors. Each feature vector should capture important
# information about its corresponding element in the original raw sequence. So
# in this example, since we work with sequences of words and want to identify
# names, we will create feature vectors that tell us if the word is capitalized
# or not. In our simple data, this will be enough to identify names.
# Therefore, we define sentence_to_vectors() which takes a sentence represented
# as a string and converts it into an array of words and then associates a
# feature vector with each word.
|
2013-06-07 08:27:29 +08:00
|
|
|
def sentence_to_vectors(sentence):
    """Convert a sentence into a dlib.vectors array, one vector per word.

    The sentence is split on whitespace.  Each word contributes a
    1-dimensional dense feature vector: dlib.vector([1]) when the word's
    first character is uppercase, dlib.vector([0]) otherwise.

    Args:
        sentence: The sentence to featurize, as a single string.

    Returns:
        A dlib.vectors object holding one feature vector per word, in word
        order.
    """
    # Create an empty array of vectors
    vects = dlib.vectors()
    for word in sentence.split():
        # Our vectors are very simple 1-dimensional vectors.  The value of the
        # single feature is 1 if the first letter of the word is capitalized and
        # 0 otherwise.  (str.split() never yields empty strings, so word[0] is
        # always valid here.)
        if word[0].isupper():
            vects.append(dlib.vector([1]))
        else:
            vects.append(dlib.vector([0]))
    return vects
|
|
|
|
|
2014-12-11 17:44:50 +08:00
|
|
|
|
|
|
|
# Dlib also supports the use of a sparse vector representation. This is more
# efficient than the above form when you have very high dimensional vectors that
# are mostly full of zeros. In dlib, each sparse vector is represented as an
# array of pair objects. Each pair contains an index and value. Any index not
# listed in the vector is implicitly associated with a value of zero.
# Additionally, when using sparse vectors with dlib.train_sequence_segmenter()
# you can use "unsorted" sparse vectors. This means you can add the index/value
# pairs into your sparse vectors in any order you want and don't need to worry
# about them being in sorted order.
|
2013-06-07 08:27:29 +08:00
|
|
|
def sentence_to_sparse_vectors(sentence):
    """Convert a sentence into a dlib.sparse_vectors array, one per word.

    This is the sparse counterpart of sentence_to_vectors(): the sentence is
    split on whitespace and each word is mapped to one of two sparse vectors
    depending on whether its first character is uppercase.

    Args:
        sentence: The sentence to featurize, as a single string.

    Returns:
        A dlib.sparse_vectors object holding one sparse feature vector per
        word, in word order.
    """
    vects = dlib.sparse_vectors()
    has_cap = dlib.sparse_vector()
    no_cap = dlib.sparse_vector()
    # make has_cap equivalent to dlib.vector([1])
    has_cap.append(dlib.pair(0, 1))

    # Since we didn't add anything to no_cap it is equivalent to
    # dlib.vector([0])
    for word in sentence.split():
        if word[0].isupper():
            vects.append(has_cap)
        else:
            vects.append(no_cap)
    return vects
|
|
|
|
|
|
|
|
|
|
|
|
def print_segment(sentence, names):
    """Print the words of each detected segment, one segment per line.

    Args:
        sentence: The sentence the segments refer to, as a single string.  It
            is split on whitespace to recover the word array.
        names: An iterable of segments, where each segment is itself an
            iterable of word indices into the split sentence (for example the
            ranges returned by the segmentation model).

    Each segment is written to sys.stdout as its words, every word followed
    by a single space, and then a newline.
    """
    words = sentence.split()
    for name in names:
        for i in name:
            sys.stdout.write(words[i] + " ")
        sys.stdout.write("\n")
|
|
|
|
|
|
|
|
|
2014-12-28 04:30:56 +08:00
|
|
|
|
2014-12-11 17:44:50 +08:00
|
|
|
# Now let's make some training data.  Each example is a sentence as well as a
# set of ranges which indicate the locations of any names.
names = dlib.ranges()      # make an array of dlib.range objects.
segments = dlib.rangess()  # make an array of arrays of dlib.range objects.
sentences = []

# NOTE(review): the pattern below reuses `names` after each
# segments.append(names); this presumably relies on append() storing a copy
# rather than a reference -- confirm against the dlib bindings.

sentences.append("The other day I saw a man named Jim Smith")
# We want to detect person names.  So we note that the name is located within
# the range [8, 10).  Note that we use half open ranges to identify segments.
# So in this case, the segment identifies the string "Jim Smith".
names.append(dlib.range(8, 10))
segments.append(names)
names.clear()  # make names empty for use again below

sentences.append("Davis King is the main author of the dlib Library")
names.append(dlib.range(0, 2))
segments.append(names)
names.clear()

sentences.append("Bob Jones is a name and so is George Clinton")
names.append(dlib.range(0, 2))
names.append(dlib.range(8, 10))
segments.append(names)
names.clear()

sentences.append("My dog is named Bob Barker")
names.append(dlib.range(4, 6))
segments.append(names)
names.clear()

sentences.append("ABC is an acronym but John James Smith is a name")
names.append(dlib.range(5, 8))
segments.append(names)
names.clear()

# A negative example: the (empty) set of ranges says there are no names here.
sentences.append("No names in this sentence at all")
segments.append(names)
names.clear()
|
|
|
|
|
2014-12-28 04:30:56 +08:00
|
|
|
|
2014-12-11 17:44:50 +08:00
|
|
|
# Now before we can pass these training sentences to the dlib tools we need to
# convert them into arrays of vectors as discussed above.  We can use either a
# sparse or dense representation depending on our needs.  In this example, we
# show how to do it both ways.
use_sparse_vects = False
if use_sparse_vects:
    # Make an array of arrays of dlib.sparse_vector objects.
    training_sequences = dlib.sparse_vectorss()
    for s in sentences:
        training_sequences.append(sentence_to_sparse_vectors(s))
else:
    # Make an array of arrays of dlib.vector objects.
    training_sequences = dlib.vectorss()
    for s in sentences:
        training_sequences.append(sentence_to_vectors(s))
|
2013-05-27 02:21:02 +08:00
|
|
|
|
2014-12-11 17:44:50 +08:00
|
|
|
# Now that we have a simple training set we can train a sequence segmenter.
# However, the sequence segmentation trainer has some optional parameters we can
# set.  These parameters determine properties of the segmentation model we will
# learn.  See the dlib documentation for the sequence_segmenter object for a
# full discussion of their meanings.
params = dlib.segmenter_params()
params.window_size = 3
params.use_high_order_features = True
params.use_BIO_model = True
# This is the common SVM C parameter.  Larger values encourage the trainer to
# attempt to fit the data exactly but might overfit.  In general, you determine
# this parameter by cross-validation.
params.C = 10

# Train a model.  The model object is responsible for predicting the locations
# of names in new sentences.
model = dlib.train_sequence_segmenter(training_sequences, segments, params)
|
2013-05-27 02:21:02 +08:00
|
|
|
|
2014-12-11 17:44:50 +08:00
|
|
|
# Let's print out the things the model thinks are names.  The output is a set
# of ranges which are predicted to contain names.  If you run this example
# program you will see that it gets them all correct.
for i, s in enumerate(sentences):
    print_segment(s, model(training_sequences[i]))

# Let's also try segmenting a new sentence.  This will print out "Bob Bucket".
# Note that we need to remember to use the same vector representation as we used
# during training.
test_sentence = ("There once was a man from Nantucket "
                 "whose name rhymed with Bob Bucket")
if use_sparse_vects:
    print_segment(test_sentence,
                  model(sentence_to_sparse_vectors(test_sentence)))
else:
    print_segment(test_sentence, model(sentence_to_vectors(test_sentence)))
|
2013-06-07 08:38:52 +08:00
|
|
|
|
2014-12-11 17:44:50 +08:00
|
|
|
# We can also measure the accuracy of a model relative to some labeled data.
# This statement prints the precision, recall, and F1-score of the model
# relative to the data in training_sequences/segments.
print("Test on training data: {}".format(
    dlib.test_sequence_segmenter(model, training_sequences, segments)))

# We can also do 5-fold cross-validation and print the resulting precision,
# recall, and F1-score.
print("Cross validation: {}".format(
    dlib.cross_validate_sequence_segmenter(training_sequences, segments, 5,
                                           params)))
|