diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index ebc46d038..0b0681616 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -121,6 +121,8 @@ if (NOT USING_OLD_VISUAL_STUDIO_COMPILER)
    add_gui_example(random_cropper_ex)
    add_gui_example(dnn_mmod_dog_hipsterizer)
    add_gui_example(dnn_imagenet_ex)
+   add_gui_example(dnn_mmod_find_cars_ex)
+   add_example(dnn_mmod_train_find_cars_ex)
    if (NOT MSVC)
       # Don't try to compile this program using Visual Studio since it causes the
       # compiler to run out of RAM and to crash.  Maybe someday Visual Studio
diff --git a/examples/dnn_mmod_find_cars_ex.cpp b/examples/dnn_mmod_find_cars_ex.cpp
new file mode 100644
index 000000000..dfb4bcd8e
--- /dev/null
+++ b/examples/dnn_mmod_find_cars_ex.cpp
@@ -0,0 +1,175 @@
+
+
+#include <iostream>
+#include <dlib/dnn.h>
+#include <dlib/data_io.h>
+#include <dlib/matrix.h>
+#include <dlib/geometry.h>
+#include <dlib/image_io.h>
+#include <dlib/gui_widgets.h>
+#include <dlib/image_processing.h>
+
+using namespace std;
+using namespace dlib;
+
+
+
+// The dnn rear view vehicle detector network
+template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
+template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
+template <typename SUBNET> using downsampler  = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
+template <typename SUBNET> using rcon5  = relu<affine<con5<55,SUBNET>>>;
+using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
+
+// ----------------------------------------------------------------------------------------
+
+int main() try
+{
+    net_type net;
+    shape_predictor sp;
+    // You can get this file from http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
+    // This network was produced by the dnn_mmod_train_find_cars_ex.cpp example program.
+    // As you can see, it also includes a shape_predictor.  To see a generic example of how
+    // to train those refer to train_shape_predictor_ex.cpp.
+    deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp;
+
+    matrix<rgb_pixel> img;
+    load_image(img, "../mmod_cars_test_image.jpg");
+
+    image_window win;
+    win.set_image(img);
+
+    // Run the detector on the image and show us the output.
+    for (auto&& d : net(img))
+    {
+        // We use a shape_predictor to refine the exact shape and location of the detection
+        // box.  This shape_predictor is trained to simply output the 4 corner points.  So
+        // all we do is make a rectangle that tightly contains those 4 points and that
+        // rectangle is our refined detection position.
+        auto fd = sp(img,d);
+        rectangle rect;
+        for (unsigned long j = 0; j < fd.num_parts(); ++j)
+            rect += fd.part(j);
+        win.add_overlay(rect, rgb_pixel(255,0,0));
+    }
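+
+    // As an aside (an illustrative sketch, not part of the original example): the
+    // mmod_rect objects returned by net(img) also carry a detection confidence, so if
+    // you wanted to log or threshold the raw detections you could do something like:
+    //
+    //    for (auto&& d : net(img))
+    //        cout << "confidence: " << d.detection_confidence << "   box: " << d.rect << endl;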
+
+
+    cout << "Hit enter to view the intermediate processing steps" << endl;
+    cin.get();
+
+
+    // Create a tiled image pyramid and display it on the screen.
+    std::vector<rectangle> rects;
+    matrix<rgb_pixel> tiled_img;
+    create_tiled_pyramid<std::remove_reference<decltype(input_layer(net))>::type::pyramid_type>(img,
+        tiled_img, rects, input_layer(net).get_pyramid_padding(),
+        input_layer(net).get_pyramid_outer_padding());
+    image_window winpyr(tiled_img, "Tiled image pyramid");
+
+
+
+    cout << "Number of channels in final tensor image: " << net.subnet().get_output().k() << endl;
+    matrix<float> network_output = image_plane(net.subnet().get_output(),0,0);
+    for (long k = 1; k < net.subnet().get_output().k(); ++k)
+        network_output = max_pointwise(network_output, image_plane(net.subnet().get_output(),0,k));
+    const double v0_scale = img.nc()/(double)network_output.nc();
+    resize_image(v0_scale, network_output);
+
+
+    const float lower = -2.5;  // min(network_output);
+    const float upper = 0.0;   // max(network_output);
+    cout << "jet color mapping range: lower=" << lower << "  upper=" << upper << endl;
+
+    // Display the final layer as a color image
+    image_window win_output(jet(network_output, upper, lower), "Output tensor from the network");
+
+
+
+    // Overlay network_output on top of the tiled image pyramid and display it.
+    matrix<rgb_pixel> tiled_img_sal = tiled_img;
+    for (long r = 0; r < tiled_img_sal.nr(); ++r)
+    {
+        for (long c = 0; c < tiled_img_sal.nc(); ++c)
+        {
+            dpoint tmp(c,r);
+            tmp = input_tensor_to_output_tensor(net, tmp);
+            tmp = point(v0_scale*tmp);
+            if (get_rect(network_output).contains(tmp))
+            {
+                float val = network_output(tmp.y(),tmp.x());
+                rgb_alpha_pixel p;
+                assign_pixel(p, colormap_jet(val,lower,upper));
+                p.alpha = 120;
+                assign_pixel(tiled_img_sal(r,c), p);
+            }
+        }
+    }
+    image_window win_pyr_sal(tiled_img_sal, "Saliency on image pyramid");
+
+
+
+
+    // Now collapse the pyramid scales into the original image
+    matrix<float> collapsed_saliency(img.nr(), img.nc());
+    resizable_tensor input_tensor;
+    input_layer(net).to_tensor(&img, &img+1, input_tensor);
+    for (long r = 0; r < collapsed_saliency.nr(); ++r)
+    {
+        for (long c = 0; c < collapsed_saliency.nc(); ++c)
+        {
+            // Loop over a bunch of scale values and look up what part of network_output
+            // corresponds to the point(c,r) in the original image, then take the max
+            // saliency value over all the scales and save it at pixel point(c,r).
+            float max_sal = -1e30;
+            for (double scale = 1; scale > 0.2; scale *= 5.0/6.0)
+            {
+                // Map from input image coordinates to tiled pyramid coordinates and then
+                // to output tensor coordinates.
+                dpoint tmp = center(input_layer(net).image_space_to_tensor_space(input_tensor, scale, drectangle(dpoint(c,r))));
+                tmp = point(v0_scale*input_tensor_to_output_tensor(net, tmp));
+                if (get_rect(network_output).contains(tmp))
+                {
+                    float val = network_output(tmp.y(),tmp.x());
+                    if (val > max_sal)
+                        max_sal = val;
+                }
+            }
+
+            collapsed_saliency(r,c) = max_sal;
+
+            // Also blend the saliency into the original input image so we can view it as
+            // an overlay on the cars.
+            rgb_alpha_pixel p;
+            assign_pixel(p, colormap_jet(max_sal,lower,upper));
+            p.alpha = 120;
+            assign_pixel(img(r,c), p);
+        }
+    }
+
+    image_window win_collapsed(jet(collapsed_saliency, upper, lower), "Collapsed saliency map");
+    image_window win_img_and_sal(img);
+
+
+    cout << "Hit enter to end program" << endl;
+    cin.get();
+}
+catch(image_load_error& e)
+{
+    cout << e.what() << endl;
+    cout << "The test image is located in the examples folder.  So you should run this program" << endl;
+    cout << "from a sub folder so that the relative path is correct." << endl;
+}
+catch(serialization_error& e)
+{
+    cout << e.what() << endl;
+    cout << "The model file can be obtained from: http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2" << endl;
+    cout << "Don't forget to unzip the file." << endl;
+}
+catch(std::exception& e)
+{
+    cout << e.what() << endl;
+}
+
+
+
+
diff --git a/examples/dnn_mmod_train_find_cars_ex.cpp b/examples/dnn_mmod_train_find_cars_ex.cpp
new file mode 100644
index 000000000..6bfcf9b76
--- /dev/null
+++ b/examples/dnn_mmod_train_find_cars_ex.cpp
@@ -0,0 +1,348 @@
+// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
+/*
+    This example shows how to train a CNN based object detector using dlib's
+    loss_mmod loss layer.  This loss layer implements the Max-Margin Object
+    Detection loss as described in the paper:
+        Max-Margin Object Detection by Davis E. King (http://arxiv.org/abs/1502.00046).
+    This is the same loss used by the popular SVM+HOG object detector in dlib (see
+    fhog_object_detector_ex.cpp) except here we replace the HOG features with a CNN
+    and train the entire detector end-to-end.  This allows us to make much more
+    powerful detectors.
+
+    It would be a good idea to become familiar with dlib's DNN tooling before reading
+    this example.  So you should read dnn_introduction_ex.cpp and
+    dnn_introduction2_ex.cpp before reading this example program.  You should also
+    read the introductory DNN+MMOD example, dnn_mmod_ex.cpp.
+
+    This example is essentially a more complex version of dnn_mmod_ex.cpp.  In it we
+    train a detector that finds the rear ends of motor vehicles.  I will also discuss
+    some aspects of data preparation useful when training this kind of detector.
+*/
+
+
+#include <iostream>
+#include <dlib/dnn.h>
+#include <dlib/data_io.h>
+#include <dlib/image_processing.h>
+#include <dlib/gui_widgets.h>
+
+using namespace std;
+using namespace dlib;
+
+
+
+// The dnn vehicle detector network
+template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
+template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;
+template <typename SUBNET> using downsampler  = relu<bn_con<con5d<32, relu<bn_con<con5d<32, relu<bn_con<con5d<16,SUBNET>>>>>>>>>;
+template <typename SUBNET> using rcon5  = relu<bn_con<con5<55,SUBNET>>>;
+using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;
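+
+// As an aside (this note is not in the original file): dnn_mmod_find_cars_ex.cpp
+// declares this same network for inference, but with affine layers in place of the
+// bn_con layers used here for training, e.g.
+//    template <typename SUBNET> using rcon5 = relu<affine<con5<55,SUBNET>>>;
+// dlib can deserialize a network trained with bn_con into that affine form, which is
+// faster at inference time.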
+ << "\n\t images.size(): " << images.size() + << "\n\t objects.size(): " << objects.size() + ); + + typename image_array_type::value_type temp; + pyramid_type pyr; + for (unsigned long i = 0; i < images.size(); ++i) + { + if (images[i].size() < 1800*1800) + { + pyramid_up(images[i], temp, pyr); + swap(temp, images[i]); + for (unsigned long j = 0; j < objects[i].size(); ++j) + { + objects[i][j].rect = pyr.rect_up(objects[i][j].rect); + } + } + } +} + +// ---------------------------------------------------------------------------------------- + +int main(int argc, char** argv) try +{ + if (argc != 2) + { + cout << "Give the path to a folder containing training.xml and testing.xml files." << endl; + cout << "This example program is specifically designed to run on the dlib vehicle " << endl; + cout << "detection dataset, which is available at this URL: " << endl; + cout << " http://dlib.net/files/data/dlib_rear_end_vehicles_v1.tar" << endl; + cout << endl; + cout << "So download that dataset, extract it somewhere, and then run this program" << endl; + cout << "with the dlib_rear_end_vehicles folder as an argument. E.g. if you extract" << endl; + cout << "the dataset to the current folder then you should run this example program" << endl; + cout << "by typing: " << endl; + cout << " ./dnn_mmod_train_find_cars_ex dlib_rear_end_vehicles" << endl; + cout << endl; + return 0; + } + const std::string data_directory = argv[1]; + + + std::vector> images_train, images_test; + std::vector> boxes_train, boxes_test; + load_image_dataset(images_train, boxes_train, data_directory+"/training.xml"); + load_image_dataset(images_test, boxes_test, data_directory+"/testing.xml"); + + + int num_overlapped_ignored_test = 0; + for (auto& v : boxes_test) + num_overlapped_ignored_test += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.99)); + + int num_overlapped_ignored = 0; + int num_additional_ignored = 0; + for (auto& v : boxes_train) + { + num_overlapped_ignored += ignore_overlapped_boxes(v, test_box_overlap(0.50, 0.99)); + for (auto& bb : v) + { + if (bb.rect.width() < 35 && bb.rect.height() < 35) + { + if (!bb.ignore) + { + bb.ignore = true; + ++num_additional_ignored; + } + } + + // The dlib vehicle detection dataset doesn't contain any detections with + // really extreme aspect ratios. However, some datasets do, often because of + // bad labeling. So it's a good idea to check for that and either eliminate + // those boxes or set them to ignore. Although, this depends on your + // application. + // + // For instance, if your dataset has boxes with an aspect ratio + // of 10 then you should think about what that means for the network + // architecture. Does the receptive field even cover the entirety of the box + // in those cases? Do you care about these boxes? Are they labeling errors? + // I find that many people will download some dataset from the internet and + // just take it as given. They run it through some training algorithm and take + // the dataset as unchallengeable truth. But many datasets are full of + // labeling errors. There are also a lot of datasets that aren't full of + // errors, but are annotated in a sloppy and inconsistent way. Fixing those + // errors and inconsistencies can often greatly improve models trained from + // such data. It's almost always worth the time to try and improve your + // training dataset. 
+        }
+    }
+
+    cout << "num_overlapped_ignored: " << num_overlapped_ignored << endl;
+    cout << "num_additional_ignored: " << num_additional_ignored << endl;
+    cout << "num_overlapped_ignored_test: " << num_overlapped_ignored_test << endl;
+
+
+    cout << "num training images: " << images_train.size() << endl;
+    cout << "num testing images:  " << images_test.size() << endl;
+
+
+    // Our vehicle detection dataset has basically 3 different types of boxes: square
+    // boxes, tall and skinny boxes (e.g. semi trucks), and short and wide boxes (e.g.
+    // sedans).  Here we are telling the MMOD algorithm that a vehicle is recognizable
+    // as long as the longest box side is at least 70 pixels long and the shortest box
+    // side is at least 30 pixels long.  It will use these parameters to decide how
+    // large each of the sliding windows needs to be so as to be able to detect all
+    // the vehicles.  Since our dataset has basically only these 3 different aspect
+    // ratios, it will decide to use 3 different sliding windows at the end of the
+    // network.
+    mmod_options options(boxes_train, 70, 30);
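+
+    // To verify that claim you can print the sliding windows MMOD selected.  This
+    // loop is an illustrative sketch, not part of the original program:
+    //
+    //    for (auto& w : options.detector_windows)
+    //        cout << "detector window: " << w.width << " x " << w.height << endl;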
+
+    // This setting is very important and dataset specific.  The vehicle detection
+    // dataset contains boxes that are marked as "ignore", as we discussed above.
+    // Some of them are ignored because we set ignore to true on them in the above
+    // code.  However, the xml files already contained a lot of ignore boxes.  Some of
+    // them are large boxes that encompass large parts of an image, and the intention
+    // is to have everything inside those boxes be ignored.  Therefore, we need to
+    // tell the MMOD algorithm to do that, which we do by setting
+    // options.overlaps_ignore appropriately.
+    //
+    // But first, we need to understand exactly what this option does.  The MMOD loss
+    // is essentially counting the number of false alarms + missed detections produced
+    // by the detector for each image.  During training, the code is running the
+    // detector on each image in a mini-batch, looking at its output, and counting the
+    // number of mistakes.  The optimizer tries to find parameter settings that
+    // minimize the number of detector mistakes.
+    //
+    // This overlaps_ignore option allows you to tell the loss that some outputs from
+    // the detector should be totally ignored, as if they never happened.  In
+    // particular, if a detection overlaps a box in the training data with
+    // ignore==true then that detection is ignored.  This overlap is determined by
+    // calling options.overlaps_ignore(the_detection, the_ignored_training_box).  If
+    // it returns true then that detection is ignored.
+    //
+    // You should read the documentation for test_box_overlap, the class type of
+    // overlaps_ignore, for full details.  However, the gist is that the default
+    // behavior is to only consider boxes as overlapping if their intersection over
+    // union is > 0.5.  However, the dlib vehicle detection dataset contains large
+    // boxes that are meant to mask out large areas of an image.  So intersection over
+    // union isn't an appropriate way to measure "overlaps with box" in this case.  We
+    // want any box that is contained inside one of these big regions to be ignored,
+    // even if the detection box is really small.  So we set overlaps_ignore to behave
+    // that way with this line.
+    options.overlaps_ignore = test_box_overlap(0.5, 0.95);
+
+    net_type net(options);
+    // The final layer of the network must be a con layer that contains
+    // options.detector_windows.size() filters.  This is because these final filters
+    // are what perform the final "sliding window" detection in the network.
+    net.subnet().layer_details().set_num_filters(options.detector_windows.size());
+
+    dnn_trainer<net_type> trainer(net, sgd(0.0001, 0.9));
+    trainer.set_learning_rate(0.1);
+    trainer.be_verbose();
+    trainer.set_iterations_without_progress_threshold(50000);
+    trainer.set_test_iterations_without_progress_threshold(1000);
+    const string sync_filename = "mmod_cars_sync";
+    trainer.set_synchronization_file(sync_filename, std::chrono::minutes(5));
+
+
+    std::vector<matrix<rgb_pixel>> mini_batch_samples;
+    std::vector<std::vector<mmod_rect>> mini_batch_labels;
+    random_cropper cropper;
+    cropper.set_seed(time(0));
+    cropper.set_chip_dims(350, 350);
+    cropper.set_min_object_size(0.20);
+    cropper.set_max_rotation_degrees(2);
+    dlib::rand rnd;
+    cout << trainer << cropper << endl;
+
+    int cnt = 1;
+    // Run the trainer until the learning rate gets small.
+    while (trainer.get_learning_rate() >= 1e-4)
+    {
+        // Every 30th mini-batch comes from the testing data, so the trainer can also
+        // track the testing loss, as long as we have testing data at all.
+        if (cnt%30 != 0 || images_test.size() == 0)
+        {
+            cropper(87, images_train, boxes_train, mini_batch_samples, mini_batch_labels);
+            // We can also randomly jitter the colors and that often helps a detector
+            // generalize better to new images.
+            for (auto&& img : mini_batch_samples)
+                disturb_colors(img, rnd);
+
+            // It's a good idea to, at least once, put code here that displays the
+            // images and boxes the random cropper is generating.  You should look at
+            // them and think about whether the output makes sense for your problem.
+            // Most of the time it will be fine, but sometimes you will realize that
+            // the pattern of cropping isn't really appropriate for your problem and
+            // you will need to make some change to how the mini-batches are being
+            // generated.  Maybe you will tweak some of the cropper's settings, or
+            // write your own entirely separate code to create mini-batches.  But
+            // either way, if you don't look you will never know.  An easy way to do
+            // this is to create a dlib::image_window to display the images and boxes,
+            // as sketched below.
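+
+            // Such inspection code might look like this (a sketch, not part of the
+            // original program; ignore boxes are drawn in a different color):
+            //
+            //    image_window cwin;
+            //    for (size_t i = 0; i < mini_batch_samples.size(); ++i)
+            //    {
+            //        cwin.clear_overlay();
+            //        cwin.set_image(mini_batch_samples[i]);
+            //        for (auto& b : mini_batch_labels[i])
+            //            cwin.add_overlay(b.rect, b.ignore ? rgb_pixel(255,255,0) : rgb_pixel(255,0,0));
+            //        cout << "Hit enter to see the next crop.";
+            //        cin.get();
+            //    }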
+
+            trainer.train_one_step(mini_batch_samples, mini_batch_labels);
+        }
+        else
+        {
+            cropper(87, images_test, boxes_test, mini_batch_samples, mini_batch_labels);
+            // We can also randomly jitter the colors and that often helps a detector
+            // generalize better to new images.
+            for (auto&& img : mini_batch_samples)
+                disturb_colors(img, rnd);
+
+            trainer.test_one_step(mini_batch_samples, mini_batch_labels);
+        }
+        ++cnt;
+    }
+    // Wait for training threads to stop.
+    trainer.get_net();
+    cout << "done training" << endl;
+
+    // Save the network to disk
+    net.clean();
+    serialize("mmod_rear_end_vehicle_detector.dat") << net;
+
+
+    // It's a really good idea to print the training parameters.  This is because you
+    // will invariably be running multiple rounds of training and should be logging
+    // the output to a log file.  This print statement will include many of the
+    // training parameters in your log.
+    cout << trainer << cropper << endl;
+
+    cout << "\nsync_filename: " << sync_filename << endl;
+    cout << "num training images: " << images_train.size() << endl;
+    cout << "training results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);
+    upsample_image_dataset_limit<pyramid_down<2>>(images_train, boxes_train);
+    cout << "training upsampled results: " << test_object_detection_function(net, images_train, boxes_train, test_box_overlap(), 0, options.overlaps_ignore);
+
+
+    cout << "num testing images: " << images_test.size() << endl;
+    cout << "testing results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);
+    upsample_image_dataset_limit<pyramid_down<2>>(images_test, boxes_test);
+    cout << "testing upsampled results: " << test_object_detection_function(net, images_test, boxes_test, test_box_overlap(), 0, options.overlaps_ignore);
+
+    /*
+        This program takes many hours to execute on a high end GPU.  It took about a
+        day to train on an NVIDIA 1080ti.  The resulting model file is available at
+            http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2
+        It should be noted that this file on dlib.net has a dlib::shape_predictor
+        appended onto the end of it (see dnn_mmod_find_cars_ex.cpp for an example of
+        its use).  This explains why the model file on dlib.net is larger than the
+        mmod_rear_end_vehicle_detector.dat output by this program.
+
+        Also, the training and testing accuracies were:
+            num training images: 2217
+            training results:           0.990738 0.736431 0.736073
+            training upsampled results: 0.986837 0.937694 0.936912
+            num testing images: 135
+            testing results:            0.988827 0.471372 0.470806
+            testing upsampled results:  0.987879 0.651132 0.650399
+    */
+
+    return 0;
+}
+catch(std::exception& e)
+{
+    cout << e.what() << endl;
+}
+
+
+
diff --git a/examples/mmod_cars_test_image.jpg b/examples/mmod_cars_test_image.jpg
new file mode 100644
index 000000000..cfffffe66
Binary files /dev/null and b/examples/mmod_cars_test_image.jpg differ
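
Note: as the closing comment in dnn_mmod_train_find_cars_ex.cpp says, the file
distributed at http://dlib.net/files/mmod_rear_end_vehicle_detector.dat.bz2 also has a
dlib::shape_predictor appended to it.  A minimal sketch of how such a combined file can
be written, assuming a shape_predictor trained separately (e.g. along the lines of
train_shape_predictor_ex.cpp; the input file name here is hypothetical):

    shape_predictor sp;
    deserialize("car_corner_shape_predictor.dat") >> sp;   // hypothetical file name
    serialize("mmod_rear_end_vehicle_detector.dat") << net << sp;

dnn_mmod_find_cars_ex.cpp then reads both objects back with a single
deserialize("mmod_rear_end_vehicle_detector.dat") >> net >> sp; statement.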