Added evaluate_detectors() to make it easy to run a bunch of HOG detectors efficiently, even when their window sizes differ.
Davis King 2014-05-26 18:46:13 -04:00
parent 09af3eb856
commit 4f275bd7bd
2 changed files with 378 additions and 121 deletions

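As a quick illustration of the new function, here is a minimal usage sketch (an editor's illustration, not part of the diff below; the detector and image file names are placeholders, while the types and calls are the existing dlib API plus the evaluate_detectors() added by this commit):

#include <dlib/image_processing.h>
#include <dlib/image_io.h>
#include <fstream>
#include <iostream>
#include <vector>

using namespace dlib;

int main()
{
    // The scanner type the detectors were trained with (6/5 downscaling per
    // pyramid level, as in dlib's fhog examples).
    typedef scan_fhog_pyramid<pyramid_down<6> > image_scanner_type;

    // Load several independently trained detectors.  The file names are
    // placeholders for whatever detectors you have on disk.
    std::vector<object_detector<image_scanner_type> > detectors(2);
    std::ifstream fin1("face_detector.svm", std::ios::binary);
    deserialize(detectors[0], fin1);
    std::ifstream fin2("car_detector.svm", std::ios::binary);
    deserialize(detectors[1], fin2);

    array2d<unsigned char> img;
    load_image(img, "test.jpg");

    // Run every detector over the image.  The HOG pyramid is computed once and
    // shared across detectors, so this is faster than calling each one in turn.
    std::vector<rect_detection> dets;
    evaluate_detectors(detectors, img, dets);

    for (unsigned long i = 0; i < dets.size(); ++i)
    {
        std::cout << "detector " << dets[i].weight_index
                  << " scored " << dets[i].detection_confidence
                  << " at " << dets[i].rect << std::endl;
    }

    // Or, if only the boxes matter, the convenience overload returns them directly.
    std::vector<rectangle> boxes = evaluate_detectors(detectors, img);
    std::cout << "total boxes: " << boxes.size() << std::endl;
}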
@@ -226,7 +226,6 @@ namespace dlib
return num;
}
private:
std::vector<matrix<float> > filters;
std::vector<std::vector<matrix<float,0,1> > > row_filters, col_filters;
};
@@ -361,14 +360,6 @@ namespace dlib
height = temp.height();
}
static bool compare_pair_rect (
const std::pair<double, rectangle>& a,
const std::pair<double, rectangle>& b
)
{
return a.first < b.first;
}
void get_mapped_rect_and_metadata (
const unsigned long number_pyramid_levels,
const rectangle& rect,
@@ -389,12 +380,6 @@ namespace dlib
typedef array<array2d<float> > fhog_image;
static rectangle apply_filters_to_fhog (
const fhog_filterbank& w,
const fhog_image& feats,
array2d<float>& saliency_image
);
feature_extractor_type fe;
array<fhog_image> feats;
int cell_size;
@@ -422,54 +407,56 @@ namespace dlib
// ----------------------------------------------------------------------------------------
template <typename T, typename U>
rectangle scan_fhog_pyramid<T,U>::
apply_filters_to_fhog (
const fhog_filterbank& w,
const fhog_image& feats,
array2d<float>& saliency_image
)
namespace impl
{
const unsigned long num_separable_filters = w.num_separable_filters();
rectangle area;
// use the separable filters if they would be faster than running the regular filters.
if (num_separable_filters > w.filters.size()*std::min(w.filters[0].nr(),w.filters[0].nc())/3.0)
template <typename fhog_filterbank>
rectangle apply_filters_to_fhog (
const fhog_filterbank& w,
const array<array2d<float> >& feats,
array2d<float>& saliency_image
)
{
area = spatially_filter_image(feats[0], saliency_image, w.filters[0]);
for (unsigned long i = 1; i < w.filters.size(); ++i)
const unsigned long num_separable_filters = w.num_separable_filters();
rectangle area;
// use the separable filters if they would be faster than running the regular filters.
if (num_separable_filters > w.filters.size()*std::min(w.filters[0].nr(),w.filters[0].nc())/3.0)
{
// now we filter but the output adds to saliency_image rather than
// overwriting it.
spatially_filter_image(feats[i], saliency_image, w.filters[i], 1, false, true);
}
}
else
{
saliency_image.clear();
array2d<float> scratch;
// find the first filter to apply
unsigned long i = 0;
while (i < w.row_filters.size() && w.row_filters[i].size() == 0)
++i;
for (; i < w.row_filters.size(); ++i)
{
for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)
area = spatially_filter_image(feats[0], saliency_image, w.filters[0]);
for (unsigned long i = 1; i < w.filters.size(); ++i)
{
if (saliency_image.size() == 0)
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,false);
else
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,true);
// now we filter but the output adds to saliency_image rather than
// overwriting it.
spatially_filter_image(feats[i], saliency_image, w.filters[i], 1, false, true);
}
}
if (saliency_image.size() == 0)
else
{
saliency_image.set_size(feats[0].nr(), feats[0].nc());
assign_all_pixels(saliency_image, 0);
saliency_image.clear();
array2d<float> scratch;
// find the first filter to apply
unsigned long i = 0;
while (i < w.row_filters.size() && w.row_filters[i].size() == 0)
++i;
for (; i < w.row_filters.size(); ++i)
{
for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)
{
if (saliency_image.size() == 0)
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,false);
else
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,true);
}
}
if (saliency_image.size() == 0)
{
saliency_image.set_size(feats[0].nr(), feats[0].nc());
assign_all_pixels(saliency_image, 0);
}
}
return area;
}
return area;
}
// ----------------------------------------------------------------------------------------
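For intuition on the branch in apply_filters_to_fhog() above: the separable (row/column) path is only taken when there are few enough separable components to beat plain 2D filtering. A rough standalone sketch of that decision rule follows (an illustrative helper, not dlib code; the cost reasoning in the comments is approximate):

#include <algorithm>

// Rough model of the choice made in apply_filters_to_fhog().  A full 2D filter
// costs on the order of nr*nc multiplies per pixel per filter, while each
// separable (row + column) component is much cheaper.  dlib approximates the
// trade-off by using the separable path only when
//   num_separable_filters <= num_filters * min(nr, nc) / 3.
bool use_separable_filters (
    unsigned long num_separable_filters,
    unsigned long num_filters,
    long filter_nr,
    long filter_nc
)
{
    const double cutoff = num_filters * std::min(filter_nr, filter_nc) / 3.0;
    return num_separable_filters <= cutoff;
}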
@@ -561,6 +548,70 @@ namespace dlib
fe = fe_;
}
// ----------------------------------------------------------------------------------------
namespace impl
{
template <
typename pyramid_type,
typename image_type,
typename feature_extractor_type
>
void create_fhog_pyramid (
const image_type& img,
const feature_extractor_type& fe,
array<array<array2d<float> > >& feats,
int cell_size,
int filter_rows_padding,
int filter_cols_padding,
unsigned long min_pyramid_layer_width,
unsigned long min_pyramid_layer_height,
unsigned long max_pyramid_levels
)
{
unsigned long levels = 0;
rectangle rect = get_rect(img);
// figure out how many pyramid levels we should be using based on the image size
pyramid_type pyr;
do
{
rect = pyr.rect_down(rect);
++levels;
} while (rect.width() >= min_pyramid_layer_width && rect.height() >= min_pyramid_layer_height &&
levels < max_pyramid_levels);
if (feats.max_size() < levels)
feats.set_max_size(levels);
feats.set_size(levels);
typedef typename image_type::type pixel_type;
typedef typename image_type::mem_manager_type mem_manager_type;
// build our feature pyramid
fe(img, feats[0], cell_size,filter_rows_padding,filter_cols_padding);
DLIB_ASSERT(feats[0].size() == fe.get_num_planes(),
"Invalid feature extractor used with dlib::scan_fhog_pyramid. The output does not have the \n"
"indicated number of planes.");
if (feats.size() > 1)
{
array2d<pixel_type,mem_manager_type> temp1, temp2;
pyr(img, temp1);
fe(temp1, feats[1], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
for (unsigned long i = 2; i < feats.size(); ++i)
{
pyr(temp2, temp1);
fe(temp1, feats[i], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
}
}
}
}
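The do/while loop in create_fhog_pyramid() above simply counts how many times the image can be downsampled before it falls below the minimum layer size or hits the level cap. A minimal sketch of that calculation, assuming a dlib-style pyramid object with a rect_down() member (mirroring the loop above, not new behavior):

#include <dlib/geometry.h>
#include <dlib/image_transforms.h>

// Count pyramid levels the same way create_fhog_pyramid() does: keep calling
// rect_down() until the rectangle would be too small or we reach max_levels.
// Level 0 is always the original image.
template <typename pyramid_type>
unsigned long count_pyramid_levels (
    const dlib::rectangle& image_rect,
    unsigned long min_width,
    unsigned long min_height,
    unsigned long max_levels
)
{
    pyramid_type pyr;
    dlib::rectangle rect = image_rect;
    unsigned long levels = 0;
    do
    {
        rect = pyr.rect_down(rect);
        ++levels;
    } while (rect.width() >= min_width && rect.height() >= min_height &&
             levels < max_levels);
    return levels;
}

// e.g. count_pyramid_levels<dlib::pyramid_down<6> >(dlib::rectangle(0,0,639,479), 64, 64, 1000)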
// ----------------------------------------------------------------------------------------
template <
@@ -575,48 +626,11 @@ namespace dlib
const image_type& img
)
{
unsigned long levels = 0;
rectangle rect = get_rect(img);
// figure out how many pyramid levels we should be using based on the image size
pyramid_type pyr;
do
{
rect = pyr.rect_down(rect);
++levels;
} while (rect.width() >= min_pyramid_layer_width && rect.height() >= min_pyramid_layer_height &&
levels < max_pyramid_levels);
if (feats.max_size() < levels)
feats.set_max_size(levels);
feats.set_size(levels);
unsigned long width, height;
compute_fhog_window_size(width,height);
typedef typename image_type::type pixel_type;
typedef typename image_type::mem_manager_type mem_manager_type;
// build our feature pyramid
fe(img, feats[0], cell_size,height,width);
DLIB_ASSERT(feats[0].size() == fe.get_num_planes(),
"Invalid feature extractor used with dlib::scan_fhog_pyramid. The output does not have the \n"
"indicated number of planes.");
if (feats.size() > 1)
{
array2d<pixel_type,mem_manager_type> temp1, temp2;
pyr(img, temp1);
fe(temp1, feats[1], cell_size,height,width);
swap(temp1,temp2);
for (unsigned long i = 2; i < feats.size(); ++i)
{
pyr(temp2, temp1);
fe(temp1, feats[i], cell_size,height,width);
swap(temp1,temp2);
}
}
impl::create_fhog_pyramid<Pyramid_type>(img, fe, feats, cell_size, height,
width, min_pyramid_layer_width, min_pyramid_layer_height,
max_pyramid_levels);
}
// ----------------------------------------------------------------------------------------
@@ -730,6 +744,82 @@ namespace dlib
max_pyramid_levels = max_levels;
}
// ----------------------------------------------------------------------------------------
namespace impl
{
inline bool compare_pair_rect (
const std::pair<double, rectangle>& a,
const std::pair<double, rectangle>& b
)
{
return a.first < b.first;
}
template <
typename pyramid_type,
typename feature_extractor_type,
typename fhog_filterbank
>
void detect_from_fhog_pyramid (
const array<array<array2d<float> > >& feats,
const feature_extractor_type& fe,
const fhog_filterbank& w,
const double thresh,
const unsigned long det_box_height,
const unsigned long det_box_width,
const int cell_size,
const int filter_rows_padding,
const int filter_cols_padding,
std::vector<std::pair<double, rectangle> >& dets
)
{
dets.clear();
array2d<float> saliency_image;
pyramid_type pyr;
// for all pyramid levels
for (unsigned long l = 0; l < feats.size(); ++l)
{
const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image);
// now search the saliency image for any detections
for (long r = area.top(); r <= area.bottom(); ++r)
{
for (long c = area.left(); c <= area.right(); ++c)
{
// if we found a detection
if (saliency_image[r][c] >= thresh)
{
rectangle rect = fe.feats_to_image(centered_rect(point(c,r),det_box_width,det_box_height),
cell_size, filter_rows_padding, filter_cols_padding);
rect = pyr.rect_up(rect, l);
dets.push_back(std::make_pair(saliency_image[r][c], rect));
}
}
}
}
std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);
}
inline bool overlaps_any_box (
const test_box_overlap& tester,
const std::vector<rect_detection>& rects,
const rect_detection& rect
)
{
for (unsigned long i = 0; i < rects.size(); ++i)
{
if (tester(rects[i].rect, rect.rect))
return true;
}
return false;
}
}
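overlaps_any_box() above is the building block for the greedy non-max suppression done at the end of evaluate_detectors(): visit detections best-first and keep each one only if it overlaps nothing already kept. A self-contained sketch of that scheme (for simplicity it uses a single overlap tester, whereas evaluate_detectors() looks up the tester of the detector that produced each candidate):

#include <dlib/image_processing.h>
#include <algorithm>
#include <vector>

// Greedy non-max suppression in the same spirit as the loop at the end of
// evaluate_detectors(): sort by confidence (rect_detection's operator< compares
// detection_confidence), then accept a detection only if it does not overlap
// anything accepted so far.
inline void greedy_nms (
    std::vector<dlib::rect_detection>& candidates,   // sorted in place
    const dlib::test_box_overlap& tester,
    std::vector<dlib::rect_detection>& keep
)
{
    keep.clear();
    std::sort(candidates.rbegin(), candidates.rend());  // highest confidence first
    for (unsigned long i = 0; i < candidates.size(); ++i)
    {
        bool overlaps = false;
        for (unsigned long j = 0; j < keep.size() && !overlaps; ++j)
            overlaps = tester(keep[j].rect, candidates[i].rect);
        if (!overlaps)
            keep.push_back(candidates[i]);
    }
}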
// ----------------------------------------------------------------------------------------
template <
@@ -754,36 +844,11 @@ namespace dlib
<< "\n\t this: " << this
);
dets.clear();
unsigned long width, height;
compute_fhog_window_size(width,height);
array2d<float> saliency_image;
pyramid_type pyr;
// for all pyramid levels
for (unsigned long l = 0; l < feats.size(); ++l)
{
const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image);
// now search the saliency image for any detections
for (long r = area.top(); r <= area.bottom(); ++r)
{
for (long c = area.left(); c <= area.right(); ++c)
{
// if we found a detection
if (saliency_image[r][c] >= thresh)
{
rectangle rect = fe.feats_to_image(centered_rect(point(c,r),width-2*padding,height-2*padding), cell_size, height,width);
rect = pyr.rect_up(rect, l);
dets.push_back(std::make_pair(saliency_image[r][c], rect));
}
}
}
}
std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);
impl::detect_from_fhog_pyramid<pyramid_type>(feats, fe, w, thresh,
height-2*padding, width-2*padding, cell_size, height, width, dets);
}
// ----------------------------------------------------------------------------------------
@@ -1145,6 +1210,134 @@ namespace dlib
};
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename pyramid_type,
typename image_type
>
void evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<pyramid_type> > >& detectors,
const image_type& img,
std::vector<rect_detection>& dets,
const double adjust_threshold = 0
)
{
typedef scan_fhog_pyramid<pyramid_type> scanner_type;
dets.clear();
if (detectors.size() == 0)
return;
const int cell_size = detectors[0].get_scanner().get_cell_size();
// Find the maximum sized filters and also most extreme pyramiding settings used.
unsigned long max_filter_width = 0;
unsigned long max_filter_height = 0;
unsigned long min_pyramid_layer_width = std::numeric_limits<unsigned long>::max();
unsigned long min_pyramid_layer_height = std::numeric_limits<unsigned long>::max();
unsigned long max_pyramid_levels = 0;
bool all_cell_sizes_the_same = true;
for (unsigned long i = 0; i < detectors.size(); ++i)
{
const scanner_type& scanner = detectors[i].get_scanner();
max_filter_width = std::max(max_filter_width, scanner.get_fhog_window_width());
max_filter_height = std::max(max_filter_height, scanner.get_fhog_window_height());
max_pyramid_levels = std::max(max_pyramid_levels, scanner.get_max_pyramid_levels());
min_pyramid_layer_width = std::min(min_pyramid_layer_width, scanner.get_min_pyramid_layer_width());
min_pyramid_layer_height = std::min(min_pyramid_layer_height, scanner.get_min_pyramid_layer_height());
if (cell_size != scanner.get_cell_size())
all_cell_sizes_the_same = false;
}
std::vector<rect_detection> dets_accum;
// Do the HOG feature extraction to make the fhog pyramid.  Again, note that we
// are making a pyramid that will work with any of the detectors, but only if all
// the cell sizes are the same.  If they aren't then we have to calculate the
// pyramid for each detector individually.
array<array<array2d<float> > > feats;
if (all_cell_sizes_the_same)
{
impl::create_fhog_pyramid<pyramid_type>(img,
detectors[0].get_scanner().get_feature_extractor(), feats, cell_size,
max_filter_height, max_filter_width, min_pyramid_layer_width,
min_pyramid_layer_height, max_pyramid_levels);
}
std::vector<std::pair<double, rectangle> > temp_dets;
for (unsigned long i = 0; i < detectors.size(); ++i)
{
const scanner_type& scanner = detectors[i].get_scanner();
if (!all_cell_sizes_the_same)
{
impl::create_fhog_pyramid<pyramid_type>(img,
scanner.get_feature_extractor(), feats, scanner.get_cell_size(),
max_filter_height, max_filter_width, min_pyramid_layer_width,
min_pyramid_layer_height, max_pyramid_levels);
}
const unsigned long det_box_width = scanner.get_fhog_window_width() - 2*scanner.get_padding();
const unsigned long det_box_height = scanner.get_fhog_window_height() - 2*scanner.get_padding();
// A single detector object might itself have multiple weight vectors in it. So
// we need to evaluate all of them.
for (unsigned d = 0; d < detectors[i].num_detectors(); ++d)
{
const double thresh = detectors[i].get_processed_w(d).w(scanner.get_num_dimensions());
impl::detect_from_fhog_pyramid<pyramid_type>(feats, scanner.get_feature_extractor(),
detectors[i].get_processed_w(d).get_detect_argument(), thresh+adjust_threshold,
det_box_height, det_box_width, cell_size, max_filter_height,
max_filter_width, temp_dets);
for (unsigned long j = 0; j < temp_dets.size(); ++j)
{
rect_detection temp;
temp.detection_confidence = temp_dets[j].first-thresh;
temp.weight_index = i;
temp.rect = temp_dets[j].second;
dets_accum.push_back(temp);
}
}
}
// Do non-max suppression
dets.clear();
if (detectors.size() > 1)
std::sort(dets_accum.rbegin(), dets_accum.rend());
for (unsigned long i = 0; i < dets_accum.size(); ++i)
{
const test_box_overlap tester = detectors[dets_accum[i].weight_index].get_overlap_tester();
if (impl::overlaps_any_box(tester, dets, dets_accum[i]))
continue;
dets.push_back(dets_accum[i]);
}
}
// ----------------------------------------------------------------------------------------
template <
typename Pyramid_type,
typename image_type
>
std::vector<rectangle> evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<Pyramid_type> > >& detectors,
const image_type& img,
const double adjust_threshold = 0
)
{
std::vector<rectangle> out_dets;
std::vector<rect_detection> dets;
evaluate_detectors(detectors, img, dets, adjust_threshold);
out_dets.reserve(dets.size());
for (unsigned long i = 0; i < dets.size(); ++i)
out_dets.push_back(dets[i].rect);
return out_dets;
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
}

@@ -693,6 +693,70 @@ namespace dlib
provides deserialization support
!*/
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
template <
typename pyramid_type,
typename image_type
>
void evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<pyramid_type> > >& detectors,
const image_type& img,
std::vector<rect_detection>& dets,
const double adjust_threshold = 0
);
/*!
ensures
- This function runs each of the provided object_detector objects over img and
stores the resulting detections into #dets. Importantly, this function is
faster than running each detector individually because it computes the HOG
features only once and then reuses them for each detector. However, it is
important to note that this speedup is only possible if all the detectors use
the same cell_size parameter that determines how HOG features are computed.
If different cell_size values are used then this function will not be any
faster than running the detectors individually.
- This function applies non-max suppression to the outputs from all detectors
and therefore none of the outputs will overlap with each other.
- To be precise, this function performs object detection on the given image and
stores the detected objects into #dets. In particular, we will have that:
- #dets is sorted such that the highest confidence detections come first.
E.g. element 0 is the best detection, element 1 the next best, and so on.
- #dets.size() == the number of detected objects.
- #dets[i].detection_confidence == The strength of the i-th detection.
Larger values indicate that the detector is more confident that #dets[i]
is a correct detection rather than being a false alarm. Moreover, the
detection_confidence is equal to the detection value output by the
scanner minus the threshold value stored at the end of the weight vector.
- #dets[i].rect == the bounding box for the i-th detection.
- The detection #dets[i].rect was produced by detectors[#dets[i].weight_index].
- The detection threshold is adjusted by having adjust_threshold added to it.
Therefore, an adjust_threshold value > 0 makes detecting objects harder while
a negative value makes it easier. Moreover, the following will be true for
all valid i:
- #dets[i].detection_confidence >= adjust_threshold
This means that, for example, you can obtain the maximum possible number of
detections by setting adjust_threshold equal to negative infinity.
!*/
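To make the adjust_threshold behaviour above concrete, here is a small hedged sketch (the helper name and the -0.5 value are illustrative; the calls are the evaluate_detectors() overload documented above):

#include <dlib/image_processing.h>
#include <vector>

// Given already-loaded detectors and an image, compare a default run against
// one with a relaxed threshold.  Per the contract above, every detection in
// relaxed_dets satisfies detection_confidence >= -0.5, so relaxed_dets is a
// superset of strict_dets.
template <typename image_type>
void show_adjust_threshold_effect (
    const std::vector<dlib::object_detector<dlib::scan_fhog_pyramid<dlib::pyramid_down<6> > > >& detectors,
    const image_type& img
)
{
    std::vector<dlib::rect_detection> strict_dets, relaxed_dets;

    // Default: only detections scoring at or above each detector's own threshold.
    dlib::evaluate_detectors(detectors, img, strict_dets);

    // Lower the effective threshold by 0.5: weaker candidates are admitted.
    dlib::evaluate_detectors(detectors, img, relaxed_dets, -0.5);
}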
// ----------------------------------------------------------------------------------------
template <
typename pyramid_type,
typename image_type
>
std::vector<rectangle> evaluate_detectors (
const std::vector<object_detector<scan_fhog_pyramid<pyramid_type> > >& detectors,
const image_type& img,
const double adjust_threshold = 0
);
/*!
ensures
- This function just calls the above evaluate_detectors() routine and copies
the output dets into a vector<rectangle> object and returns it. Therefore,
this function is provided for convenience.
!*/
// ----------------------------------------------------------------------------------------
}