Added evaluate_detectors() to make it easy to run a bunch of HOG detectors efficiently, even when their window sizes differ.
Davis King 2014-05-26 18:46:13 -04:00
parent 09af3eb856
commit 4f275bd7bd
2 changed files with 378 additions and 121 deletions

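As a quick illustration of the new function, here is a minimal usage sketch (an editor's illustration, not part of the diff below; the detector and image file names are placeholders, while the types and calls are the existing dlib API plus the evaluate_detectors() added by this commit):

#include <dlib/image_processing.h>
#include <dlib/image_io.h>
#include <fstream>
#include <iostream>
#include <vector>

using namespace dlib;

int main()
{
    // The scanner type the detectors were trained with (6/5 downscaling per
    // pyramid level, as in dlib's fhog examples).
    typedef scan_fhog_pyramid<pyramid_down<6> > image_scanner_type;

    // Load several independently trained detectors.  The file names are
    // placeholders for whatever detectors you have on disk.
    std::vector<object_detector<image_scanner_type> > detectors(2);
    std::ifstream fin1("face_detector.svm", std::ios::binary);
    deserialize(detectors[0], fin1);
    std::ifstream fin2("car_detector.svm", std::ios::binary);
    deserialize(detectors[1], fin2);

    array2d<unsigned char> img;
    load_image(img, "test.jpg");

    // Run every detector over the image.  The HOG pyramid is computed once and
    // shared across detectors, so this is faster than calling each one in turn.
    std::vector<rect_detection> dets;
    evaluate_detectors(detectors, img, dets);

    for (unsigned long i = 0; i < dets.size(); ++i)
    {
        std::cout << "detector " << dets[i].weight_index
                  << " scored " << dets[i].detection_confidence
                  << " at " << dets[i].rect << std::endl;
    }

    // Or, if only the boxes matter, the convenience overload returns them directly.
    std::vector<rectangle> boxes = evaluate_detectors(detectors, img);
    std::cout << "total boxes: " << boxes.size() << std::endl;
}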
@@ -226,7 +226,6 @@ namespace dlib
return num;
}
private:
std::vector<matrix<float> > filters;
std::vector<std::vector<matrix<float,0,1> > > row_filters, col_filters;
};
@@ -361,14 +360,6 @@ namespace dlib
height = temp.height();
}
static bool compare_pair_rect (
const std::pair<double, rectangle>& a,
const std::pair<double, rectangle>& b
)
{
return a.first < b.first;
}
void get_mapped_rect_and_metadata (
const unsigned long number_pyramid_levels,
const rectangle& rect,
@@ -389,12 +380,6 @@ namespace dlib
typedef array<array2d<float> > fhog_image;
static rectangle apply_filters_to_fhog (
const fhog_filterbank& w,
const fhog_image& feats,
array2d<float>& saliency_image
);
feature_extractor_type fe;
array<fhog_image> feats;
int cell_size;
@@ -422,54 +407,56 @@ namespace dlib
// ----------------------------------------------------------------------------------------
template <typename T, typename U>
rectangle scan_fhog_pyramid<T,U>::
apply_filters_to_fhog (
const fhog_filterbank& w,
const fhog_image& feats,
array2d<float>& saliency_image
)
namespace impl
{
const unsigned long num_separable_filters = w.num_separable_filters();
rectangle area;
// use the separable filters if they would be faster than running the regular filters.
if (num_separable_filters > w.filters.size()*std::min(w.filters[0].nr(),w.filters[0].nc())/3.0)
template <typename fhog_filterbank>
rectangle apply_filters_to_fhog (
const fhog_filterbank& w,
const array<array2d<float> >& feats,
array2d<float>& saliency_image
)
{
area = spatially_filter_image(feats[0], saliency_image, w.filters[0]);
for (unsigned long i = 1; i < w.filters.size(); ++i)
const unsigned long num_separable_filters = w.num_separable_filters();
rectangle area;
// use the separable filters if they would be faster than running the regular filters.
if (num_separable_filters > w.filters.size()*std::min(w.filters[0].nr(),w.filters[0].nc())/3.0)
{
// now we filter but the output adds to saliency_image rather than
// overwriting it.
spatially_filter_image(feats[i], saliency_image, w.filters[i], 1, false, true);
}
}
else
{
saliency_image.clear();
array2d<float> scratch;
// find the first filter to apply
unsigned long i = 0;
while (i < w.row_filters.size() && w.row_filters[i].size() == 0)
++i;
for (; i < w.row_filters.size(); ++i)
{
for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)
area = spatially_filter_image(feats[0], saliency_image, w.filters[0]);
for (unsigned long i = 1; i < w.filters.size(); ++i)
{
if (saliency_image.size() == 0)
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,false);
else
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,true);
// now we filter but the output adds to saliency_image rather than
// overwriting it.
spatially_filter_image(feats[i], saliency_image, w.filters[i], 1, false, true);
}
}
if (saliency_image.size() == 0)
else
{
saliency_image.set_size(feats[0].nr(), feats[0].nc());
assign_all_pixels(saliency_image, 0);
saliency_image.clear();
array2d<float> scratch;
// find the first filter to apply
unsigned long i = 0;
while (i < w.row_filters.size() && w.row_filters[i].size() == 0)
++i;
for (; i < w.row_filters.size(); ++i)
{
for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)
{
if (saliency_image.size() == 0)
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,false);
else
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,true);
}
}
if (saliency_image.size() == 0)
{
saliency_image.set_size(feats[0].nr(), feats[0].nc());
assign_all_pixels(saliency_image, 0);
}
}
return area;
}
return area;
}
// ----------------------------------------------------------------------------------------
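For intuition on the branch in apply_filters_to_fhog() above: the separable (row/column) path is only taken when there are few enough separable components to beat plain 2D filtering. A rough standalone sketch of that decision rule follows (an illustrative helper, not dlib code; the cost reasoning in the comments is approximate):

#include <algorithm>

// Rough model of the choice made in apply_filters_to_fhog().  A full 2D filter
// costs on the order of nr*nc multiplies per pixel per filter, while each
// separable (row + column) component is much cheaper.  dlib approximates the
// trade-off by using the separable path only when
//   num_separable_filters <= num_filters * min(nr, nc) / 3.
bool use_separable_filters (
    unsigned long num_separable_filters,
    unsigned long num_filters,
    long filter_nr,
    long filter_nc
)
{
    const double cutoff = num_filters * std::min(filter_nr, filter_nc) / 3.0;
    return num_separable_filters <= cutoff;
}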
@@ -561,6 +548,70 @@ namespace dlib
fe = fe_;
}
// ----------------------------------------------------------------------------------------
namespace impl
{
template <
typename pyramid_type,
typename image_type,
typename feature_extractor_type
>
void create_fhog_pyramid (
const image_type& img,
const feature_extractor_type& fe,
array<array<array2d<float> > >& feats,
int cell_size,
int filter_rows_padding,
int filter_cols_padding,
unsigned long min_pyramid_layer_width,
unsigned long min_pyramid_layer_height,
unsigned long max_pyramid_levels
)
{
unsigned long levels = 0;
rectangle rect = get_rect(img);
// figure out how many pyramid levels we should be using based on the image size
pyramid_type pyr;
do
{
rect = pyr.rect_down(rect);
++levels;
} while (rect.width() >= min_pyramid_layer_width && rect.height() >= min_pyramid_layer_height &&
levels < max_pyramid_levels);
if (feats.max_size() < levels)
feats.set_max_size(levels);
feats.set_size(levels);
typedef typename image_type::type pixel_type;
typedef typename image_type::mem_manager_type mem_manager_type;
// build our feature pyramid
fe(img, feats[0], cell_size,filter_rows_padding,filter_cols_padding);
DLIB_ASSERT(feats[0].size() == fe.get_num_planes(),
"Invalid feature extractor used with dlib::scan_fhog_pyramid. The output does not have the \n"
"indicated number of planes.");
if (feats.size() > 1)
{
array2d<pixel_type,mem_manager_type> temp1, temp2;
pyr(img, temp1);
fe(temp1, feats[1], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
for (unsigned long i = 2; i < feats.size(); ++i)
{
pyr(temp2, temp1);
fe(temp1, feats[i], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
}
}
}
}
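The do/while loop in create_fhog_pyramid() above simply counts how many times the image can be downsampled before it falls below the minimum layer size or hits the level cap. A minimal sketch of that calculation, assuming a dlib-style pyramid object with a rect_down() member (mirroring the loop above, not new behavior):

#include <dlib/geometry.h>
#include <dlib/image_transforms.h>

// Count pyramid levels the same way create_fhog_pyramid() does: keep calling
// rect_down() until the rectangle would be too small or we reach max_levels.
// Level 0 is always the original image.
template <typename pyramid_type>
unsigned long count_pyramid_levels (
    const dlib::rectangle& image_rect,
    unsigned long min_width,
    unsigned long min_height,
    unsigned long max_levels
)
{
    pyramid_type pyr;
    dlib::rectangle rect = image_rect;
    unsigned long levels = 0;
    do
    {
        rect = pyr.rect_down(rect);
        ++levels;
    } while (rect.width() >= min_width && rect.height() >= min_height &&
             levels < max_levels);
    return levels;
}

// e.g. count_pyramid_levels<dlib::pyramid_down<6> >(dlib::rectangle(0,0,639,479), 64, 64, 1000)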
// ----------------------------------------------------------------------------------------
template <
@@ -575,48 +626,11 @@ namespace dlib
const image_type& img
)
{
unsigned long levels = 0;
rectangle rect = get_rect(img);
// figure out how many pyramid levels we should be using based on the image size
pyramid_type pyr;
do
{
rect = pyr.rect_down(rect);
++levels;
} while (rect.width() >= min_pyramid_layer_width && rect.height() >= min_pyramid_layer_height &&
levels < max_pyramid_levels);
if (feats.max_size() < levels)
feats.set_max_size(levels);
feats.set_size(levels);
unsigned long width, height;
compute_fhog_window_size(width,height);
typedef typename image_type::type pixel_type;
typedef typename image_type::mem_manager_type mem_manager_type;
// build our feature pyramid
fe(img, feats[0], cell_size,height,width);
DLIB_ASSERT(feats[0].size() == fe.get_num_planes(),
"Invalid feature extractor used with dlib::scan_fhog_pyramid. The output does not have the \n"
"indicated number of planes.");
if (feats.size() > 1)
{
array2d<pixel_type,mem_manager_type> temp1, temp2;
pyr(img, temp1);
fe(temp1, feats[1], cell_size,height,width);
swap(temp1,temp2);
for (unsigned long i = 2; i < feats.size(); ++i)
{
pyr(temp2, temp1);
fe(temp1, feats[i], cell_size,height,width);
swap(temp1,temp2);
}
}
impl::create_fhog_pyramid<Pyramid_type>(img, fe, feats, cell_size, height,
width, min_pyramid_layer_width, min_pyramid_layer_height,
max_pyramid_levels);
}
// ----------------------------------------------------------------------------------------
@@ -730,6 +744,82 @@ namespace dlib
max_pyramid_levels = max_levels;
}
// ----------------------------------------------------------------------------------------
namespace impl
{
inline bool compare_pair_rect (
const std::pair<double, rectangle>& a,
const std::pair<double, rectangle>& b
)
{
return a.first < b.first;
}
template <
typename pyramid_type,
typename feature_extractor_type,
typename fhog_filterbank
>
void detect_from_fhog_pyramid (
const array<array<array2d<float> > >& feats,
const feature_extractor_type& fe,
const fhog_filterbank& w,
const double thresh,
const unsigned long det_box_height,
const unsigned long det_box_width,
const int cell_size,
const int filter_rows_padding,
const int filter_cols_padding,
std::vector<std::pair<double, rectangle> >& dets
)
{
dets.clear();
array2d<float> saliency_image;
pyramid_type pyr;
// for all pyramid levels
for (unsigned long l = 0; l < feats.size(); ++l)
{
const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image);
// now search the saliency image for any detections
for (long r = area.top(); r <= area.bottom(); ++r)
{
for (long c = area.left(); c <= area.right(); ++c)
{
// if we found a detection
if (saliency_image[r][c] >= thresh)
{
rectangle rect = fe.feats_to_image(centered_rect(point(c,r),det_box_width,det_box_height),
cell_size, filter_rows_padding, filter_cols_padding);
rect = pyr.rect_up(rect, l);
dets.push_back(std::make_pair(saliency_image[r][c], rect));
}
}
}
}
std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);
}
inline bool overlaps_any_box (
const test_box_overlap& tester,
const std::vector<rect_detection>& rects,
const rect_detection& rect
)
{
for (unsigned long i = 0; i < rects.size(); ++i)
{
if (tester(rects[i].rect, rect.rect))
return true;
}
return false;
}
}
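overlaps_any_box() above is the building block for the greedy non-max suppression done at the end of evaluate_detectors(): visit detections best-first and keep each one only if it overlaps nothing already kept. A self-contained sketch of that scheme (for simplicity it uses a single overlap tester, whereas evaluate_detectors() looks up the tester of the detector that produced each candidate):

#include <dlib/image_processing.h>
#include <algorithm>
#include <vector>

// Greedy non-max suppression in the same spirit as the loop at the end of
// evaluate_detectors(): sort by confidence (rect_detection's operator< compares
// detection_confidence), then accept a detection only if it does not overlap
// anything accepted so far.
inline void greedy_nms (
    std::vector<dlib::rect_detection>& candidates,   // sorted in place
    const dlib::test_box_overlap& tester,
    std::vector<dlib::rect_detection>& keep
)
{
    keep.clear();
    std::sort(candidates.rbegin(), candidates.rend());  // highest confidence first
    for (unsigned long i = 0; i < candidates.size(); ++i)
    {
        bool overlaps = false;
        for (unsigned long j = 0; j < keep.size() && !overlaps; ++j)
            overlaps = tester(keep[j].rect, candidates[i].rect);
        if (!overlaps)
            keep.push_back(candidates[i]);
    }
}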
// ----------------------------------------------------------------------------------------
template <
@@ -754,36 +844,11 @@ namespace dlib
<< "\n\t this: " << this
);
dets.clear();
unsigned long width, height;
compute_fhog_window_size(width,height);
array2d<float> saliency_image;
pyramid_type pyr;
// for all pyramid levels
for (unsigned long l = 0; l < feats.size(); ++l)
{
const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image);
// now search the saliency image for any detections
for (long r = area.top(); r <= area.bottom(); ++r)
{
for (long c = area.left(); c <= area.right(); ++c)
{
// if we found a detection
if (saliency_image[r][c] >= thresh)
{
rectangle rect = fe.feats_to_image(centered_rect(point(c,r),width-2*padding,height-2*padding), cell_size, height,width);
rect = pyr.rect_up(rect, l);
dets.push_back(std::make_pair(saliency_image[r][c], rect));
}
}
}
}
std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);
impl::detect_from_fhog_pyramid<pyramid_type>(feats, fe, w, thresh,
height-2*padding, width-2*padding, cell_size, height, width, dets);
}
// ----------------------------------------------------------------------------------------
@@ -1145,6 +1210,134 @@ namespace dlib
};
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename pyramid_type,
typename image_type
>
void evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<pyramid_type> > >& detectors,
const image_type& img,
std::vector<rect_detection>& dets,
const double adjust_threshold = 0
)
{
typedef scan_fhog_pyramid<pyramid_type> scanner_type;
dets.clear();
if (detectors.size() == 0)
return;
const int cell_size = detectors[0].get_scanner().get_cell_size();
// Find the maximum sized filters and also most extreme pyramiding settings used.
unsigned long max_filter_width = 0;
unsigned long max_filter_height = 0;
unsigned long min_pyramid_layer_width = std::numeric_limits<unsigned long>::max();
unsigned long min_pyramid_layer_height = std::numeric_limits<unsigned long>::max();
unsigned long max_pyramid_levels = 0;
bool all_cell_sizes_the_same = true;
for (unsigned long i = 0; i < detectors.size(); ++i)
{
const scanner_type& scanner = detectors[i].get_scanner();
max_filter_width = std::max(max_filter_width, scanner.get_fhog_window_width());
max_filter_height = std::max(max_filter_height, scanner.get_fhog_window_height());
max_pyramid_levels = std::max(max_pyramid_levels, scanner.get_max_pyramid_levels());
min_pyramid_layer_width = std::min(min_pyramid_layer_width, scanner.get_min_pyramid_layer_width());
min_pyramid_layer_height = std::min(min_pyramid_layer_height, scanner.get_min_pyramid_layer_height());
if (cell_size != scanner.get_cell_size())
all_cell_sizes_the_same = false;
}
std::vector<rect_detection> dets_accum;
// Do the HOG feature extraction to make the fhog pyramid.  Again, note that we
// are making a pyramid that will work with any of the detectors, but only if all
// the cell sizes are the same.  If they aren't then we have to calculate the
// pyramid for each detector individually.
array<array<array2d<float> > > feats;
if (all_cell_sizes_the_same)
{
impl::create_fhog_pyramid<pyramid_type>(img,
detectors[0].get_scanner().get_feature_extractor(), feats, cell_size,
max_filter_height, max_filter_width, min_pyramid_layer_width,
min_pyramid_layer_height, max_pyramid_levels);
}
std::vector<std::pair<double, rectangle> > temp_dets;
for (unsigned long i = 0; i < detectors.size(); ++i)
{
const scanner_type& scanner = detectors[i].get_scanner();
if (!all_cell_sizes_the_same)
{
impl::create_fhog_pyramid<pyramid_type>(img,
scanner.get_feature_extractor(), feats, scanner.get_cell_size(),
max_filter_height, max_filter_width, min_pyramid_layer_width,
min_pyramid_layer_height, max_pyramid_levels);
}
const unsigned long det_box_width = scanner.get_fhog_window_width() - 2*scanner.get_padding();
const unsigned long det_box_height = scanner.get_fhog_window_height() - 2*scanner.get_padding();
// A single detector object might itself have multiple weight vectors in it. So
// we need to evaluate all of them.
for (unsigned d = 0; d < detectors[i].num_detectors(); ++d)
{
const double thresh = detectors[i].get_processed_w(d).w(scanner.get_num_dimensions());
impl::detect_from_fhog_pyramid<pyramid_type>(feats, scanner.get_feature_extractor(),
detectors[i].get_processed_w(d).get_detect_argument(), thresh+adjust_threshold,
det_box_height, det_box_width, cell_size, max_filter_height,
max_filter_width, temp_dets);
for (unsigned long j = 0; j < temp_dets.size(); ++j)
{
rect_detection temp;
temp.detection_confidence = temp_dets[j].first-thresh;
temp.weight_index = i;
temp.rect = temp_dets[j].second;
dets_accum.push_back(temp);
}
}
}
// Do non-max suppression
dets.clear();
if (detectors.size() > 1)
std::sort(dets_accum.rbegin(), dets_accum.rend());
for (unsigned long i = 0; i < dets_accum.size(); ++i)
{
const test_box_overlap tester = detectors[dets_accum[i].weight_index].get_overlap_tester();
if (impl::overlaps_any_box(tester, dets, dets_accum[i]))
continue;
dets.push_back(dets_accum[i]);
}
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename image_type
>
std::vector<rectangle> evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<Pyramid_type> > >& detectors,
const image_type& img,
const double adjust_threshold = 0
)
{
std::vector<rectangle> out_dets;
std::vector<rect_detection> dets;
evaluate_detectors(detectors, img, dets, adjust_threshold);
out_dets.reserve(dets.size());
for (unsigned long i = 0; i < dets.size(); ++i)
out_dets.push_back(dets[i].rect);
return out_dets;
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
}

@@ -693,6 +693,70 @@ namespace dlib
provides deserialization support
!*/
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename pyramid_type,
typename image_type
>
void evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<pyramid_type> > >& detectors,
const image_type& img,
std::vector<rect_detection>& dets,
const double adjust_threshold = 0
);
/*!
ensures
- This function runs each of the provided object_detector objects over img and
stores the resulting detections into #dets. Importantly, this function is
faster than running each detector individually because it computes the HOG
features only once and then reuses them for each detector. However, it is
important to note that this speedup is only possible if all the detectors use
the same cell_size parameter that determines how HOG features are computed.
If different cell_size values are used then this function will not be any
faster than running the detectors individually.
- This function applies non-max suppression to the outputs from all detectors
and therefore none of the outputs will overlap with each other.
- To be precise, this function performs object detection on the given image and
stores the detected objects into #dets. In particular, we will have that:
- #dets is sorted such that the highest confidence detections come first.
E.g. element 0 is the best detection, element 1 the next best, and so on.
- #dets.size() == the number of detected objects.
- #dets[i].detection_confidence == The strength of the i-th detection.
Larger values indicate that the detector is more confident that #dets[i]
is a correct detection rather than being a false alarm. Moreover, the
detection_confidence is equal to the detection value output by the
scanner minus the threshold value stored at the end of the weight vector.
- #dets[i].rect == the bounding box for the i-th detection.
- The detection #dets[i].rect was produced by detectors[#dets[i].weight_index].
- The detection threshold is adjusted by having adjust_threshold added to it.
Therefore, an adjust_threshold value > 0 makes detecting objects harder while
a negative value makes it easier. Moreover, the following will be true for
all valid i:
- #dets[i].detection_confidence >= adjust_threshold
This means that, for example, you can obtain the maximum possible number of
detections by setting adjust_threshold equal to negative infinity.
!*/
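To make the adjust_threshold behaviour above concrete, here is a small hedged sketch (the helper name and the -0.5 value are illustrative; the calls are the evaluate_detectors() overload documented above):

#include <dlib/image_processing.h>
#include <vector>

// Given already-loaded detectors and an image, compare a default run against
// one with a relaxed threshold.  Per the contract above, every detection in
// relaxed_dets satisfies detection_confidence >= -0.5, so relaxed_dets is a
// superset of strict_dets.
template <typename image_type>
void show_adjust_threshold_effect (
    const std::vector<dlib::object_detector<dlib::scan_fhog_pyramid<dlib::pyramid_down<6> > > >& detectors,
    const image_type& img
)
{
    std::vector<dlib::rect_detection> strict_dets, relaxed_dets;

    // Default: only detections scoring at or above each detector's own threshold.
    dlib::evaluate_detectors(detectors, img, strict_dets);

    // Lower the effective threshold by 0.5: weaker candidates are admitted.
    dlib::evaluate_detectors(detectors, img, relaxed_dets, -0.5);
}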
// ----------------------------------------------------------------------------------------
template <
typename pyramid_type,
typename image_type
>
std::vector<rectangle> evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<pyramid_type> > >& detectors,
const image_type& img,
const double adjust_threshold = 0
);
/*!
ensures
- This function just calls the above evaluate_detectors() routine and copies
the output dets into a vector<rectangle> object and returns it. Therefore,
this function is provided for convenience.
!*/
// ----------------------------------------------------------------------------------------
}