From bbeac285d15be3fa93ab19be7c7d3d5a9bd45f7c Mon Sep 17 00:00:00 2001 From: Evgeniy Fominov Date: Fri, 22 Jul 2016 16:11:13 +0300 Subject: [PATCH] Shape predictor trainer optimizations (#126) * Shape predictor trainer optimizations * Fixed performance leak in single thread mode & made VS2010 support --- dlib/image_processing/shape_predictor.h | 147 +++++++++++++----- .../shape_predictor_abstract.h | 21 +++ examples/train_shape_predictor_ex.cpp | 5 +- 3 files changed, 133 insertions(+), 40 deletions(-) diff --git a/dlib/image_processing/shape_predictor.h b/dlib/image_processing/shape_predictor.h index 29607b43b..07b75e747 100644 --- a/dlib/image_processing/shape_predictor.h +++ b/dlib/image_processing/shape_predictor.h @@ -80,7 +80,7 @@ namespace dlib i = 0; while (i < splits.size()) { - if (feature_pixel_values[splits[i].idx1] - feature_pixel_values[splits[i].idx2] > splits[i].thresh) + if ((float)feature_pixel_values[splits[i].idx1] - (float)feature_pixel_values[splits[i].idx2] > splits[i].thresh) i = left_child(i); else i = right_child(i); @@ -235,7 +235,7 @@ namespace dlib // ------------------------------------------------------------------------------------ - template + template void extract_feature_pixel_values ( const image_type& img_, const rectangle& rect, @@ -243,7 +243,7 @@ namespace dlib const matrix& reference_shape, const std::vector& reference_pixel_anchor_idx, const std::vector >& reference_pixel_deltas, - std::vector& feature_pixel_values + std::vector& feature_pixel_values ) /*! 
requires @@ -453,6 +453,7 @@ namespace dlib _num_test_splits = 20; _feature_pool_region_padding = 0; _verbose = false; + _num_threads = 0; } unsigned long get_cascade_depth ( @@ -605,6 +606,15 @@ namespace dlib _verbose = false; } + unsigned long get_num_threads ( + ) const { return _num_threads; } + void set_num_threads ( + unsigned long num + ) + { + _num_threads = num; + } + template shape_predictor train ( const image_array& images, @@ -661,13 +671,17 @@ namespace dlib << "\n\t you can't have a part that is always set to OBJECT_PART_NOT_PRESENT." ); + // creating thread pool. if num_threads <= 1, trainer should work in caller thread + thread_pool tp(_num_threads > 1 ? _num_threads : 0); - - + // determining the type of features used for this type of images + typedef typename std::remove_const::type>::type image_type; + typedef typename image_traits::pixel_type pixel_type; + typedef typename pixel_traits::basic_pixel_type feature_type; rnd.set_seed(get_random_seed()); - std::vector samples; + std::vector> samples; const matrix initial_shape = populate_training_sample_shapes(objects, samples); const std::vector > > pixel_coordinates = randomly_sample_pixel_coordinates(initial_shape); @@ -688,17 +702,17 @@ namespace dlib // First compute the feature_pixel_values for each training sample at this // level of the cascade. - for (unsigned long i = 0; i < samples.size(); ++i) + parallel_for(tp, 0, samples.size(), [&](unsigned long i) { - extract_feature_pixel_values(images[samples[i].image_idx], samples[i].rect, - samples[i].current_shape, initial_shape, anchor_idx, - deltas, samples[i].feature_pixel_values); - } + impl::extract_feature_pixel_values(images[samples[i].image_idx], samples[i].rect, + samples[i].current_shape, initial_shape, anchor_idx, + deltas, samples[i].feature_pixel_values); + }, 1); // Now start building the trees at this cascade level. 
for (unsigned long i = 0; i < get_num_trees_per_cascade_level(); ++i) { - forests[cascade].push_back(make_regression_tree(samples, pixel_coordinates[cascade])); + forests[cascade].push_back(make_regression_tree(tp, samples, pixel_coordinates[cascade])); if (_verbose) { @@ -745,7 +759,8 @@ namespace dlib } } - struct training_sample + template + struct training_sample { /*! @@ -760,15 +775,18 @@ namespace dlib - present == 0/1 mask saying which parts of target_shape are present. - rect == the position of the object in the image_idx-th image. All shape coordinates are coded relative to this rectangle. + - diff_shape == temporary value for holding difference between current + shape and target shape !*/ unsigned long image_idx; rectangle rect; - matrix target_shape; - matrix present; + matrix target_shape; + matrix present; - matrix current_shape; - std::vector feature_pixel_values; + matrix current_shape; + matrix diff_shape; + std::vector feature_pixel_values; void swap(training_sample& item) { @@ -777,12 +795,15 @@ namespace dlib target_shape.swap(item.target_shape); present.swap(item.present); current_shape.swap(item.current_shape); + diff_shape.swap(item.diff_shape); feature_pixel_values.swap(item.feature_pixel_values); } }; + template impl::regression_tree make_regression_tree ( - std::vector& samples, + thread_pool& tp, + std::vector>& samples, const std::vector >& pixel_coordinates ) const { @@ -795,19 +816,53 @@ namespace dlib // walk the tree in breadth first order const unsigned long num_split_nodes = static_cast(std::pow(2.0, (double)get_tree_depth())-1); std::vector > sums(num_split_nodes*2+1); - for (unsigned long i = 0; i < samples.size(); ++i) - sums[0] += samples[i].target_shape - samples[i].current_shape; + if (tp.num_threads_in_pool() > 1) + { + // Here we need to calculate shape differences and store sum of differences into sums[0] + // to make it I am splitting of samples into blocks, each block will be processed by + // separate thread, and the 
sum of differences of each block is stored into separate + // place in block_sums - for (unsigned long i = 0; i < num_split_nodes; ++i) + const unsigned long num_workers = std::max(1UL, tp.num_threads_in_pool()); + const unsigned long num = samples.size(); + const unsigned long block_size = std::max(1UL, (num + num_workers - 1) / num_workers); + std::vector > block_sums(num_workers); + + parallel_for(tp, 0, num_workers, [&](unsigned long block) + { + const unsigned long block_begin = block * block_size; + const unsigned long block_end = std::min(num, block_begin + block_size); + for (unsigned long i = block_begin; i < block_end; ++i) + { + samples[i].diff_shape = samples[i].target_shape - samples[i].current_shape; + block_sums[block] += samples[i].diff_shape; + } + }, 1); + + // now calculate the total result from separate blocks + for (unsigned long i = 0; i < block_sums.size(); ++i) + sums[0] += block_sums[i]; + } + else + { + // synchronous implementation + for (unsigned long i = 0; i < samples.size(); ++i) + { + samples[i].diff_shape = samples[i].target_shape - samples[i].current_shape; + sums[0] += samples[i].diff_shape; + } + } + + for (unsigned long i = 0; i < num_split_nodes; ++i) { std::pair range = parts.front(); parts.pop_front(); - const impl::split_feature split = generate_split(samples, range.first, + const impl::split_feature split = generate_split(tp, samples, range.first, range.second, pixel_coordinates, sums[i], sums[left_child(i)], sums[right_child(i)]); tree.splits.push_back(split); - const unsigned long mid = partition_samples(split, samples, range.first, range.second); + const unsigned long mid = partition_samples(split, samples, range.first, range.second); parts.push_back(std::make_pair(range.first, mid)); parts.push_back(std::make_pair(mid, range.second)); @@ -821,7 +876,7 @@ namespace dlib { // Get the present counts for each dimension so we can divide each // dimension by the number of observations we have on it to find the mean - // 
displacement in each leaf. + // displacement in each leaf. present_counts = 0; for (unsigned long j = parts[i].first; j < parts[i].second; ++j) present_counts += samples[j].present; @@ -833,7 +888,7 @@ namespace dlib tree.leaf_values[i] = zeros_matrix(samples[0].target_shape); // now adjust the current shape based on these predictions - for (unsigned long j = parts[i].first; j < parts[i].second; ++j) + parallel_for(tp, parts[i].first, parts[i].second, [&](unsigned long j) { samples[j].current_shape += tree.leaf_values[i]; // For parts that aren't present in the training data, we just make @@ -846,7 +901,7 @@ namespace dlib if (samples[j].present(k) == 0) samples[j].target_shape(k) = samples[j].current_shape(k); } - } + }, 1); } return tree; @@ -873,8 +928,10 @@ namespace dlib return feat; } + template impl::split_feature generate_split ( - const std::vector& samples, + thread_pool& tp, + const std::vector>& samples, unsigned long begin, unsigned long end, const std::vector >& pixel_coordinates, @@ -896,24 +953,33 @@ namespace dlib std::vector > left_sums(num_test_splits); std::vector left_cnt(num_test_splits); + const unsigned long num_workers = std::max(1UL, tp.num_threads_in_pool()); + const unsigned long block_size = std::max(1UL, (num_test_splits + num_workers - 1) / num_workers); + // now compute the sums of vectors that go left for each feature - matrix temp; - for (unsigned long j = begin; j < end; ++j) + parallel_for(tp, 0, num_workers, [&](unsigned long block) { - temp = samples[j].target_shape-samples[j].current_shape; - for (unsigned long i = 0; i < num_test_splits; ++i) + const unsigned long block_begin = block * block_size; + const unsigned long block_end = std::min(block_begin + block_size, num_test_splits); + + for (unsigned long j = begin; j < end; ++j) { - if (samples[j].feature_pixel_values[feats[i].idx1] - samples[j].feature_pixel_values[feats[i].idx2] > feats[i].thresh) + for (unsigned long i = block_begin; i < block_end; ++i) { - left_sums[i] 
+= temp; - ++left_cnt[i]; + if ((float)samples[j].feature_pixel_values[feats[i].idx1] - (float)samples[j].feature_pixel_values[feats[i].idx2] > feats[i].thresh) + { + left_sums[i] += samples[j].diff_shape; + ++left_cnt[i]; + } } } - } + + }, 1); // now figure out which feature is the best double best_score = -1; unsigned long best_feat = 0; + matrix temp; for (unsigned long i = 0; i < num_test_splits; ++i) { // check how well the feature splits the space. @@ -944,9 +1010,10 @@ namespace dlib return feats[best_feat]; } + template unsigned long partition_samples ( const impl::split_feature& split, - std::vector& samples, + std::vector>& samples, unsigned long begin, unsigned long end ) const @@ -958,7 +1025,7 @@ namespace dlib unsigned long i = begin; for (unsigned long j = begin; j < end; ++j) { - if (samples[j].feature_pixel_values[split.idx1] - samples[j].feature_pixel_values[split.idx2] > split.thresh) + if ((float)samples[j].feature_pixel_values[split.idx1] - (float)samples[j].feature_pixel_values[split.idx2] > split.thresh) { samples[i].swap(samples[j]); ++i; @@ -969,9 +1036,10 @@ namespace dlib + template matrix populate_training_sample_shapes( const std::vector >& objects, - std::vector& samples + std::vector>& samples ) const { samples.clear(); @@ -982,7 +1050,7 @@ namespace dlib { for (unsigned long j = 0; j < objects[i].size(); ++j) { - training_sample sample; + training_sample sample; sample.image_idx = i; sample.rect = objects[i][j].get_rect(); object_to_shape(objects[i][j], sample.target_shape, sample.present); @@ -1099,6 +1167,7 @@ namespace dlib unsigned long _num_test_splits; double _feature_pool_region_padding; bool _verbose; + unsigned long _num_threads; }; // ---------------------------------------------------------------------------------------- diff --git a/dlib/image_processing/shape_predictor_abstract.h b/dlib/image_processing/shape_predictor_abstract.h index a37117a9b..cd3e21541 100644 --- a/dlib/image_processing/shape_predictor_abstract.h 
+++ b/dlib/image_processing/shape_predictor_abstract.h @@ -148,6 +148,7 @@ namespace dlib - #get_num_test_splits() == 20 - #get_feature_pool_region_padding() == 0 - #get_random_seed() == "" + - #get_num_threads() == 0 - This object will not be verbose !*/ @@ -367,6 +368,26 @@ namespace dlib - #get_num_test_splits() == num !*/ + unsigned long get_num_threads ( + ) const; + /*! + ensures + - When running the training process, it is possible to make some parts of it parallel + using CPU threads with the #parallel_for() extension and an internally created #thread_pool. + When get_num_threads() <= 1, the trainer will not create threads and all processing will + be done in the calling thread. + !*/ + + void set_num_threads ( + unsigned long num + ); + /*! + requires + - num >= 0 + ensures + - #get_num_threads() == num + !*/ + void be_verbose ( ); /*! diff --git a/examples/train_shape_predictor_ex.cpp b/examples/train_shape_predictor_ex.cpp index d0d3e1b28..bb3d85ddc 100644 --- a/examples/train_shape_predictor_ex.cpp +++ b/examples/train_shape_predictor_ex.cpp @@ -39,7 +39,7 @@ std::vector > get_interocular_distances ( // ---------------------------------------------------------------------------------------- int main(int argc, char** argv) -{ +{ try { // In this example we are going to train a shape_predictor based on the @@ -108,6 +108,9 @@ int main(int argc, char** argv) trainer.set_nu(0.05); trainer.set_tree_depth(2); + // Some parts of the training process can be parallelized. + // The trainer will use this number of threads when possible. + trainer.set_num_threads(2); // Tell the trainer to print status messages to the console so we can // see how long the training will take.