#include <iostream>
#include <dlib/dnn.h>
#include <dlib/image_io.h>
#include <dlib/gui_widgets.h>
#include <dlib/image_processing.h>

using namespace std;
using namespace dlib;

// The rear view vehicle detector network.
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
template <typename SUBNET> using downsampler  = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
template <typename SUBNET> using rcon5  = relu<affine<con5<55,SUBNET>>>;
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;

// ----------------------------------------------------------------------------------------

int main() try
{
    net_type net;
    shape_predictor sp;
    // You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
    // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
    // As you can see, it also includes a shape_predictor.  To see a generic example of how
    // to train those refer to train_shape_predictor_ex.cpp.
    deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp;

    matrix<rgb_pixel> img;
    load_image(img, "../mmod_cars_test_image.jpg");

    image_window win;
    win.set_image(img);

    // Run the detector on the image and show us the output.
    for (auto&& d : net(img))
    {
        // We use a shape_predictor to refine the exact shape and location of the detection
        // box.  This shape_predictor is trained to simply output the 4 corner points of the
        // box.  So all we do is make a rectangle that tightly contains those 4 points and
        // that rectangle is our refined detection position.
        auto fd = sp(img,d);
        rectangle rect;
        for (unsigned long j = 0; j < fd.num_parts(); ++j)
            rect += fd.part(j);

        win.add_overlay(rect, rgb_pixel(255,0,0));
    }

    cout << "Hit enter to view the intermediate processing steps" << endl;
    cin.get();

    // Create a tiled image pyramid and display it on the screen.  This is the image the
    // network actually ran on: all the pyramid scales packed into one big image.
    std::vector<rectangle> rects;
    matrix<rgb_pixel> tiled_img;
    create_tiled_pyramid<std::remove_reference<decltype(input_layer(net))>::type::pyramid_type>(
        img, tiled_img, rects,
        input_layer(net).get_pyramid_padding(),
        input_layer(net).get_pyramid_outer_padding());
    image_window winpyr(tiled_img, "Tiled image pyramid");

    // Collapse the network's final output tensor to a single channel by taking the
    // pointwise max over all channels, giving one saliency value per output location.
    cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
    matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
    for (long k = 1; k < net.subnet().get_output().k(); ++k)
        network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));

    // Upsample network_output back to the resolution of the tiled pyramid image so the
    // following visualizations line up with it.
    const double v0_scale = img.nc()/(double)network_output.nc();
    resize_image(v0_scale, network_output);

    const float lower = -2.5; // min(network_output);
    const float upper = 0.0;  // max(network_output);
    cout << "jet color mapping range: lower=" << lower << "  upper=" << upper << endl;

    // Display the final layer as a color image.
    image_window win_output(jet(network_output, upper, lower), "Output tensor from the network");
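
    // Before overlaying the saliency we need to relate points in the network's input
    // tensor to points in its output tensor.  dlib provides input_tensor_to_output_tensor()
    // for this: it accounts for the striding, padding, and downsampling of every layer in
    // the network (output_tensor_to_input_tensor() is the inverse mapping).  A quick
    // sketch of what it does; the point (0,0) is just an arbitrary illustration:
    const dpoint example_input_point(0,0);
    cout << "input tensor point " << example_input_point << " maps to output tensor point "
         << input_tensor_to_output_tensor(net, example_input_point) << endl;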
    // Overlay network_output on top of the tiled image pyramid and display it.
    matrix<rgb_pixel> tiled_img_sal = tiled_img;
    for (long r = 0; r < tiled_img_sal.nr(); ++r)
    {
        for (long c = 0; c < tiled_img_sal.nc(); ++c)
        {
            dpoint tmp(c,r);
            tmp = input_tensor_to_output_tensor(net, tmp);
            // network_output was upsampled by v0_scale above, so scale the mapped point
            // by the same amount.
            tmp = point(v0_scale*tmp);
            if (get_rect(network_output).contains(tmp))
            {
                float val = network_output(tmp.y(),tmp.x());
                // Alpha blend the saliency color onto the pyramid pixel.
                rgb_alpha_pixel p;
                assign_pixel(p, colormap_jet(val,lower,upper));
                p.alpha = 120;
                assign_pixel(tiled_img_sal(r,c), p);
            }
        }
    }
    image_window win_pyr_sal(tiled_img_sal, "Saliency on image pyramid");

    // Now collapse the pyramid scales into the original image.
    matrix<float> collapsed_saliency(img.nr(), img.nc());
    resizable_tensor input_tensor;
    input_layer(net).to_tensor(&img, &img+1, input_tensor);
    for (long r = 0; r < collapsed_saliency.nr(); ++r)
    {
        for (long c = 0; c < collapsed_saliency.nc(); ++c)
        {
            // Loop over a bunch of scale values and look up what part of network_output
            // corresponds to the point(c,r) in the original image, then take the max
            // saliency value over all the scales and save it at pixel point(c,r).
            float max_sal = -1e30;
            // Step the scales by 5/6, matching the pyramid_down<6> downsampling ratio
            // used by the network's input layer.
            for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
            {
                // Map from input image coordinates to tiled pyramid coordinates and then
                // to output tensor coordinates.
                dpoint tmp = center(input_layer(net).image_space_to_tensor_space(input_tensor,
                    scale, drectangle(dpoint(c,r))));
                tmp = point(v0_scale*input_tensor_to_output_tensor(net, tmp));
                if (get_rect(network_output).contains(tmp))
                {
                    float val = network_output(tmp.y(),tmp.x());
                    if (val > max_sal)
                        max_sal = val;
                }
            }
            collapsed_saliency(r,c) = max_sal;

            // Also blend the saliency into the original input image so we can view it as
            // an overlay on the cars.
            rgb_alpha_pixel p;
            assign_pixel(p, colormap_jet(max_sal,lower,upper));
            p.alpha = 120;
            assign_pixel(img(r,c), p);
        }
    }
    image_window win_collapsed(jet(collapsed_saliency, upper, lower), "Collapsed saliency map");
    image_window win_img_and_sal(img);

    cout << "Hit enter to end program" << endl;
    cin.get();
}
catch(image_load_error& e)
{
    cout << e.what() << endl;
    cout << "The test image is located in the examples folder.  So you should run this program from a sub folder so that the relative path is correct." << endl;
}
catch(serialization_error& e)
{
    cout << e.what() << endl;
    cout << "The model file can be obtained from: http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2  Don't forget to unzip the file." << endl;
}
catch(std::exception& e)
{
    cout << e.what() << endl;
}
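
// ----------------------------------------------------------------------------------------
// A note on the model file: the detector linked above is distributed bzip2 compressed.
// A minimal sketch of fetching and unpacking it, assuming a Unix-like shell with wget
// and bunzip2 available:
//
//    wget http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
//    bunzip2 mmod_rear_end_vehicle_detector.dat.bz2
//
// This leaves mmod_rear_end_vehicle_detector.dat in the working directory, which is the
// file name deserialize() expects above.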