Refined the scan_image_pyramid interface a little. In particular, I split the
get_feature_vector() method into two separate functions so the interface
is a little simpler and more flexible.
Davis King 2011-12-24 18:39:35 -05:00
parent 578322dca2
commit 3ebf0f2e3f
3 changed files with 140 additions and 72 deletions
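For readers updating code against this change, below is a minimal sketch of the new calling pattern, mirroring the update made to the structural SVM code later in this diff. The helper name get_psi_and_mapped_rects, the Scanner template parameter, and the <dlib/image_processing.h> include are illustrative assumptions, not part of the commit.

#include <vector>
#include <dlib/image_processing.h>

// Hypothetical helper showing how a caller adapts to the split interface.
// Scanner is assumed to be some scan_image_pyramid instantiation that is
// already loaded with an image and has at least one detection template.
// psi is assumed to already be sized by the caller, e.g. via
// psi.set_size(scanner.get_num_dimensions()), as in the diff below.
template <typename Scanner, typename feature_vector_type>
void get_psi_and_mapped_rects (
    const Scanner& scanner,
    const std::vector<dlib::rectangle>& rects,
    feature_vector_type& psi,
    std::vector<dlib::rectangle>& mapped_rects
)
{
    // Old interface (removed by this commit):
    //   scanner.get_feature_vector(rects, psi, mapped_rects);

    // New interface: get_feature_vector() only produces the feature vector...
    scanner.get_feature_vector(rects, psi);

    // ...and the mapped rectangles are now queried one at a time.
    mapped_rects.clear();
    for (unsigned long i = 0; i < rects.size(); ++i)
        mapped_rects.push_back(scanner.get_best_matching_rect(rects[i]));
}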


@@ -86,8 +86,11 @@ namespace dlib
void get_feature_vector (
const std::vector<rectangle>& rects,
feature_vector_type& psi,
std::vector<rectangle>& mapped_rects
feature_vector_type& psi
) const;
const rectangle get_best_matching_rect (
const rectangle& rect
) const;
template <typename T, typename U>
@@ -128,6 +131,13 @@ namespace dlib
deserialize(item.rects, in);
}
void get_mapped_rect_and_metadata (
rectangle rect,
rectangle& mapped_rect,
detection_template& best_template,
unsigned long& best_level
) const;
feature_extractor_type feats_config; // just here to hold configuration. use it to populate the feats elements.
typename array<feature_extractor_type>::kernel_2a feats;
@@ -527,6 +537,107 @@ namespace dlib
std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
const rectangle scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
get_best_matching_rect (
const rectangle& rect
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(get_num_detection_templates() > 0 &&
is_loaded_with_image(),
"\t const rectangle scan_image_pyramid::get_best_matching_rect()"
<< "\n\t Invalid inputs were given to this function "
<< "\n\t get_num_detection_templates(): " << get_num_detection_templates()
<< "\n\t is_loaded_with_image(): " << is_loaded_with_image()
<< "\n\t this: " << this
);
rectangle mapped_rect;
detection_template best_template;
unsigned long best_level;
get_mapped_rect_and_metadata(rect, mapped_rect, best_template, best_level);
return mapped_rect;
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename Feature_extractor_type
>
void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
get_mapped_rect_and_metadata (
rectangle rect,
rectangle& mapped_rect,
detection_template& best_template,
unsigned long& best_level
) const
{
pyramid_type pyr;
// Figure out the pyramid level which best matches rect against one of our
// detection template object boxes.
best_level = 0;
double match_score = std::numeric_limits<double>::infinity();
const dlib::vector<double,2> p(rect.width(), rect.height());
// for all the levels
for (unsigned long l = 0; l < feats.size(); ++l)
{
// Run the center point through the feature/image space transformation just to make
// sure we exactly replicate the procedure for shifting an object_box used elsewhere
// in this file.
const point origin = feats[l].feat_to_image_space(feats[l].image_to_feat_space(center(pyr.rect_down(rect,l))));
for (unsigned long t = 0; t < det_templates.size(); ++t)
{
// Map this detection template into the normal image space and see how
// close it is to the rect we are looking for. We do the translation here
// because the rect_up() routine takes place using integer arithmetic and
// could potentially give slightly different results with and without the
// translation.
rectangle mapped_rect = translate_rect(det_templates[t].object_box, origin);
mapped_rect = pyr.rect_up(mapped_rect, l);
const dlib::vector<double,2> p2(mapped_rect.width(),
mapped_rect.height());
if ((p-p2).length() < match_score)
{
match_score = (p-p2).length();
best_level = l;
best_template = det_templates[t];
}
}
}
// Now get the features out of feats[best_level]. But first translate best_template
// into the right spot (it should be centered at the location determined by rect)
// and convert it into the feature image coordinate system.
rect = pyr.rect_down(rect,best_level);
const point offset = -feats[best_level].image_to_feat_space(point(0,0));
const point origin = feats[best_level].image_to_feat_space(center(rect)) + offset;
for (unsigned long k = 0; k < best_template.rects.size(); ++k)
{
rectangle temp = best_template.rects[k];
temp = feats[best_level].image_to_feat_space(temp);
temp = translate_rect(temp, origin);
temp = get_rect(feats[best_level]).intersect(temp);
best_template.rects[k] = temp;
}
// The input rectangle was mapped to one of the detection templates. Reverse the process
// to figure out what the mapped rectangle is in the original input space.
mapped_rect = translate_rect(best_template.object_box, feats[best_level].feat_to_image_space(origin-offset));
mapped_rect = pyr.rect_up(mapped_rect, best_level);
}
// ----------------------------------------------------------------------------------------
template <
@@ -536,8 +647,7 @@ namespace dlib
void scan_image_pyramid<Pyramid_type,Feature_extractor_type>::
get_feature_vector (
const std::vector<rectangle>& rects,
feature_vector_type& psi,
std::vector<rectangle>& mapped_rects
feature_vector_type& psi
) const
{
// make sure requires clause is not broken
@@ -555,74 +665,18 @@ namespace dlib
psi = 0;
mapped_rects.clear();
pyramid_type pyr;
for (unsigned long i = 0; i < rects.size(); ++i)
{
// Figure out the pyramid level which best matches rects[i] against one of our
// detection template object boxes.
unsigned long best_level = 0;
double match_score = std::numeric_limits<double>::infinity();
rectangle mapped_rect;
detection_template best_template;
rectangle rect = rects[i];
const dlib::vector<double,2> p(rect.width(), rect.height());
// for all the levels
for (unsigned long l = 0; l < feats.size(); ++l)
{
// Run the center point through the feature/image space transformation just to make
// sure we exactly replicate the procedure for shifting an object_box used elsewhere
// in this file.
const point origin = feats[l].feat_to_image_space(feats[l].image_to_feat_space(center(pyr.rect_down(rect,l))));
for (unsigned long t = 0; t < det_templates.size(); ++t)
{
// Map this detection template into the normal image space and see how
// close it is to the rect we are looking for. We do the translation here
// because the rect_up() routine takes place using integer arithmetic and
// could potentially give slightly different results with and without the
// translation.
rectangle mapped_rect = translate_rect(det_templates[t].object_box, origin);
mapped_rect = pyr.rect_up(mapped_rect, l);
const dlib::vector<double,2> p2(mapped_rect.width(),
mapped_rect.height());
if ((p-p2).length() < match_score)
{
match_score = (p-p2).length();
best_level = l;
best_template = det_templates[t];
}
}
}
// Now get the features out of feats[best_level]. But first translate best_template
// into the right spot (it should be centered at the location determined by rects[i])
// and convert it into the feature image coordinate system.
rect = pyr.rect_down(rects[i],best_level);
const point offset = -feats[best_level].image_to_feat_space(point(0,0));
const point origin = feats[best_level].image_to_feat_space(center(rect)) + offset;
for (unsigned long k = 0; k < best_template.rects.size(); ++k)
{
rectangle temp = best_template.rects[k];
temp = feats[best_level].image_to_feat_space(temp);
temp = translate_rect(temp, origin);
temp = get_rect(feats[best_level]).intersect(temp);
best_template.rects[k] = temp;
}
// The input rectangle was mapped to one of the detection templates. Reverse the process
// to figure out what the mapped rectangle is in the original input space.
rectangle mapped_rect = translate_rect(best_template.object_box, feats[best_level].feat_to_image_space(origin-offset));
mapped_rect = pyr.rect_up(mapped_rect, best_level);
mapped_rects.push_back(mapped_rect);
unsigned long best_level;
get_mapped_rect_and_metadata (rects[i], mapped_rect, best_template, best_level);
for (unsigned long j = 0; j < best_template.rects.size(); ++j)
{
rect = best_template.rects[j];
const rectangle rect = best_template.rects[j];
const unsigned long template_region_id = j;
const unsigned long offset = feats_config.get_num_dimensions()*template_region_id;
for (long r = rect.top(); r <= rect.bottom(); ++r)


@@ -289,10 +289,22 @@ namespace dlib
been reached).
!*/
const rectangle get_best_matching_rect (
const rectangle& rect
) const;
/*!
requires
- is_loaded_with_image() == true
- get_num_detection_templates() > 0
ensures
- Since scan_image_pyramid is a sliding window classifier system, not all possible rectangles
can be represented. Therefore, this function allows you to supply a rectangle and obtain the
nearest possible sliding window rectangle.
!*/
void get_feature_vector (
const std::vector<rectangle>& rects,
feature_vector_type& psi,
std::vector<rectangle>& mapped_rects
feature_vector_type& psi
) const;
/*!
requires
@@ -305,7 +317,6 @@ namespace dlib
- if (rects was produced by a call to detect(), i.e. rects contains the contents of dets) then
- #psi == the sum of feature vectors corresponding to the sliding window locations contained
in rects.
- #mapped_rects == rects
- Let w denote the w vector given to detect(), then we have:
- dot(w,#psi) == sum of scores of the dets produced by detect()
- else
@@ -313,8 +324,8 @@ namespace dlib
be output by detect(). So in the case where rects contains rectangles which could not arise
from a call to detect(), this function will map the rectangles in rects to the nearest possible
object boxes and then store the sum of feature vectors for the mapped rectangles into #psi.
- for all valid i: #mapped_rects[i] == the rectangle rects[i] gets mapped to for feature extraction.
- #mapped_rects.size() == rects.size()
- for all valid i: get_best_matching_rect(rects[i]) == the rectangle rects[i] gets mapped to for
feature extraction.
!*/
};
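As a hedged illustration of the ensures clauses above (not part of this commit): if rects are exactly the boxes returned by detect(), then dot(w, psi) should equal the sum of the detection scores, and get_best_matching_rect() reports where each input rectangle was mapped. The helper name score_sum_via_psi, the Scanner template parameter, and the includes are assumptions made for the sketch.

#include <utility>
#include <vector>
#include <dlib/image_processing.h>
#include <dlib/matrix.h>

// Hypothetical sketch of the documented guarantees. Scanner is assumed to be
// a scan_image_pyramid instantiation that is already loaded with an image and
// has at least one detection template; w and thresh come from the caller.
template <typename Scanner>
double score_sum_via_psi (
    const Scanner& scanner,
    const typename Scanner::feature_vector_type& w,
    double thresh
)
{
    std::vector<std::pair<double, dlib::rectangle> > dets;
    scanner.detect(w, dets, thresh);

    std::vector<dlib::rectangle> rects;
    for (unsigned long i = 0; i < dets.size(); ++i)
        rects.push_back(dets[i].second);

    typename Scanner::feature_vector_type psi;
    psi.set_size(scanner.get_num_dimensions());
    psi = 0;
    scanner.get_feature_vector(rects, psi);

    // Because rects came straight from detect(), each rect should map to
    // itself, i.e. get_best_matching_rect(rects[i]) == rects[i].

    // Per the spec above, this should equal the sum of dets[i].first values.
    return dlib::dot(w, psi);
}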


@@ -156,7 +156,11 @@ namespace dlib
scanner.load(images[idx]);
psi.set_size(get_num_dimensions());
std::vector<rectangle> mapped_rects;
scanner.get_feature_vector(truth_rects[idx], psi, mapped_rects);
scanner.get_feature_vector(truth_rects[idx], psi);
for (unsigned long i = 0; i < truth_rects[idx].size(); ++i)
{
mapped_rects.push_back(scanner.get_best_matching_rect(truth_rects[idx][i]));
}
psi(scanner.get_num_dimensions()) = -1.0*truth_rects[idx].size();
// check if any of the boxes overlap. If they do then it is impossible for
@@ -328,8 +332,7 @@ namespace dlib
psi.set_size(get_num_dimensions());
psi = 0;
std::vector<rectangle> mapped_rects;
scanner.get_feature_vector(final_dets, psi, mapped_rects);
scanner.get_feature_vector(final_dets, psi);
psi(scanner.get_num_dimensions()) = -1.0*final_dets.size();
}