Add [net] mosaic_bound=1 option (default 0) to fix bboxes going out of mosaic bounds. Count rewritten bboxes. Change self-adversarial training.

2020-06-07 04:41:01 +03:00 · 2020-06-07 04:41:01 +03:00 · 6c6f04a9b3
parent cde72f8e0b
commit 6c6f04a9b3
8 changed files with 50 additions and 16 deletions
--- a/include/darknet.h
+++ b/include/darknet.h
@ -669,6 +669,8 @@ typedef struct network {
    float *output;
    learning_rate_policy policy;
    int benchmark_layers;
+    int *total_bbox;
+    int *rewritten_bbox;

    float learning_rate;
    float learning_rate_min;
@ -718,6 +720,7 @@ typedef struct network {
    float adversarial_lr;
    float max_chart_loss;
    int letter_box;
+    int mosaic_bound;
    float angle;
    float aspect;
    float exposure;
@ -894,6 +897,7 @@ typedef struct load_args {
    int track;
    int augment_speed;
    int letter_box;
+    int mosaic_bound;
    int show_imgs;
    int dontuse_opencv;
    float jitter;
--- a/src/data.c
+++ b/src/data.c
@ -848,7 +848,7 @@ void blend_truth(float *new_truth, int boxes, float *old_truth)

 void blend_truth_mosaic(float *new_truth, int boxes, float *old_truth, int w, int h, float cut_x, float cut_y, int i_mixup,
    int left_shift, int right_shift, int top_shift, int bot_shift,
-    int net_w, int net_h)
+    int net_w, int net_h, int mosaic_bound)
 {
    const float lowest_w = 1.F / net_w;
    const float lowest_h = 1.F / net_h;
@ -900,7 +900,7 @@ void blend_truth_mosaic(float *new_truth, int boxes, float *old_truth, int w, in
        int top = (yb - hb / 2)*h;
        int bot = (yb + hb / 2)*h;

-
+        if(mosaic_bound)
        {
            // fix out of Mosaic-bound
            float left_bound = 0, right_bound = 0, top_bound = 0, bot_bound = 0;
@ -947,8 +947,7 @@ void blend_truth_mosaic(float *new_truth, int boxes, float *old_truth, int w, in
            yb = ((float)(bot + top) / 2) / h;
            hb = ((float)(bot - top)) / h;
        }
-
-        /*
+        else
        {
            // fix out of bound
            if (left < 0) {
@ -980,7 +979,7 @@ void blend_truth_mosaic(float *new_truth, int boxes, float *old_truth, int w, in
            top = (yb - hb / 2)*h;
            bot = (yb + hb / 2)*h;
        }
-        */
+

        // leave only within the image
        if(left >= 0 && right <= w && top >= 0 && bot <= h &&
@ -1004,7 +1003,7 @@ void blend_truth_mosaic(float *new_truth, int boxes, float *old_truth, int w, in
 #include "http_stream.h"

 data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int use_gaussian_noise, int use_blur, int use_mixup,
-    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs)
+    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int mosaic_bound, int show_imgs)
 {
    const int random_index = random_gen();
    c = c ? c : 3;
@ -1263,7 +1262,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo
                    }
                }

-                blend_truth_mosaic(d.y.vals[i], boxes, truth, w, h, cut_x[i], cut_y[i], i_mixup, left_shift, right_shift, top_shift, bot_shift, w, h);
+                blend_truth_mosaic(d.y.vals[i], boxes, truth, w, h, cut_x[i], cut_y[i], i_mixup, left_shift, right_shift, top_shift, bot_shift, w, h, mosaic_bound);

                free_image(ai);
                ai.data = d.X.vals[i];
@ -1319,7 +1318,7 @@ void blend_images(image new_img, float alpha, image old_img, float beta)
 }

 data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int gaussian_noise, int use_blur, int use_mixup,
-    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs)
+    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int mosaic_bound, int show_imgs)
 {
    const int random_index = random_gen();
    c = c ? c : 3;
@ -1534,7 +1533,7 @@ void *load_thread(void *ptr)
        *a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
    } else if (a.type == DETECTION_DATA){
        *a.d = load_data_detection(a.n, a.paths, a.m, a.w, a.h, a.c, a.num_boxes, a.classes, a.flip, a.gaussian_noise, a.blur, a.mixup, a.jitter, a.resize,
-            a.hue, a.saturation, a.exposure, a.mini_batch, a.track, a.augment_speed, a.letter_box, a.show_imgs);
+            a.hue, a.saturation, a.exposure, a.mini_batch, a.track, a.augment_speed, a.letter_box, a.mosaic_bound, a.show_imgs);
    } else if (a.type == SWAG_DATA){
        *a.d = load_data_swag(a.paths, a.n, a.classes, a.jitter);
    } else if (a.type == COMPARE_DATA){
--- a/src/data.h
+++ b/src/data.h
@ -87,7 +87,7 @@ data load_data_captcha(char **paths, int n, int m, int k, int w, int h);
 data load_data_captcha_encode(char **paths, int n, int m, int w, int h);
 data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
 data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int gaussian_noise, int use_blur, int use_mixup,
-    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs);
+    float jitter, float resize, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int mosaic_bound, int show_imgs);
 data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure);
 matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure, int dontuse_opencv);
 data load_data_super(char **paths, int n, int m, int w, int h, int scale);
--- a/src/detector.c
+++ b/src/detector.c
@ -145,6 +145,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
    args.saturation = net.saturation;
    args.hue = net.hue;
    args.letter_box = net.letter_box;
+    args.mosaic_bound = net.mosaic_bound;
    if (dont_show && show_imgs) show_imgs = 2;
    args.show_imgs = show_imgs;

--- a/src/network.c
+++ b/src/network.c
@ -246,6 +246,9 @@ network make_network(int n)
    net.layers = (layer*)xcalloc(net.n, sizeof(layer));
    net.seen = (uint64_t*)xcalloc(1, sizeof(uint64_t));
    net.cur_iteration = (int*)xcalloc(1, sizeof(int));
+    net.total_bbox = (int*)xcalloc(1, sizeof(int));
+    net.rewritten_bbox = (int*)xcalloc(1, sizeof(int));
+    *net.rewritten_bbox = *net.total_bbox = 0;
 #ifdef GPU
    net.input_gpu = (float**)xcalloc(1, sizeof(float*));
    net.truth_gpu = (float**)xcalloc(1, sizeof(float*));
@ -366,6 +369,7 @@ float train_network_datum(network net, float *x, float *y)
    backward_network(net, state);
    float error = get_network_cost(net);
    //if(((*net.seen)/net.batch)%net.subdivisions == 0) update_network(net);
+    printf(" total_bbox = %d, rewritten_bbox = %f %% \n", *(state.net.total_bbox), 100 * (float)*(state.net.rewritten_bbox) / *(state.net.total_bbox));
    return error;
 }

@ -1147,6 +1151,8 @@ void free_network(network net)
    free(net.steps);
    free(net.seen);
    free(net.cur_iteration);
+    free(net.total_bbox);
+    free(net.rewritten_bbox);

 #ifdef GPU
    if (gpu_index >= 0) cuda_free(net.workspace);
--- a/src/network_kernels.cu
+++ b/src/network_kernels.cu
@ -348,6 +348,7 @@ void forward_backward_network_gpu(network net, float *x, float *y)
        cuda_free(state.delta);
        cuda_pull_array(*net.input_gpu, x, x_size);
    }
+    printf(" total_bbox = %d, rewritten_bbox = %f %% \n", *(state.net.total_bbox), 100 * (float)*(state.net.rewritten_bbox) / *(state.net.total_bbox));
 }

 float train_network_datum_gpu(network net, float *x, float *y)
@ -356,7 +357,8 @@ float train_network_datum_gpu(network net, float *x, float *y)
    if (net.adversarial_lr && rand_int(0, 1) == 1 && get_current_iteration(net) > net.burn_in) {
        net.adversarial = 1;
        float lr_old = net.learning_rate;
-        float scale = 1.0 - (get_current_iteration(net) / ((float)net.max_batches));
+        float scale = (get_current_iteration(net) / ((float)net.max_batches));
+        //scale = sin(scale * M_PI);
        net.learning_rate = net.adversarial_lr * scale;
        layer l = net.layers[net.n - 1];
        int y_size = get_network_output_size(net)*net.batch;
--- a/src/parser.c
+++ b/src/parser.c
@ -1119,6 +1119,7 @@ void parse_net_options(list *options, network *net)
    else if (cutmix) net->mixup = 2;
    else if (mosaic) net->mixup = 3;
    net->letter_box = option_find_int_quiet(options, "letter_box", 0);
+    net->mosaic_bound = option_find_int_quiet(options, "mosaic_bound", 0);
    net->label_smooth_eps = option_find_float_quiet(options, "label_smooth_eps", 0.0f);
    net->resize_step = option_find_float_quiet(options, "resize_step", 32);
    net->attention = option_find_int_quiet(options, "attention", 0);
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@ -153,8 +153,12 @@ static inline float clip_value(float val, const float max_val)
    return val;
 }

-ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, int accumulate, float max_delta)
+ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, int accumulate, float max_delta, int *rewritten_bbox)
 {
+    if (delta[index + 0 * stride] || delta[index + 1 * stride] || delta[index + 2 * stride] || delta[index + 3 * stride]) {
+        (*rewritten_bbox)++;
+    }
+
    ious all_ious = { 0 };
    // i - step in layer width
    // j - step in layer height
@ -422,9 +426,23 @@ void forward_yolo_layer(const layer l, network_state state)
                        if (scale > 0) scale = sqrt(scale);
                        l.delta[obj_index] = scale * l.cls_normalizer * (0 - l.output[obj_index]);
                        int cl_id;
+                        int found_object = 0;
                        for (cl_id = 0; cl_id < l.classes; ++cl_id) {
-                            if(l.output[class_index + stride*cl_id] * l.output[obj_index] > 0.25)
+                            if (l.output[class_index + stride*cl_id] * l.output[obj_index] > 0.25) {
                                l.delta[class_index + stride*cl_id] = scale * (0 - l.output[class_index + stride*cl_id]);
+                                found_object = 1;
+                            }
+                        }
+                        if (found_object) {
+                            // don't use this loop for adversarial attack drawing
+                            for (cl_id = 0; cl_id < l.classes; ++cl_id)
+                                if (l.output[class_index + stride*cl_id] * l.output[obj_index] < 0.25)
+                                    l.delta[class_index + stride*cl_id] = scale * (1 - l.output[class_index + stride*cl_id]);
+
+                            l.delta[box_index + 0 * stride] += scale * (0 - l.output[box_index + 0 * stride]);
+                            l.delta[box_index + 1 * stride] += scale * (0 - l.output[box_index + 1 * stride]);
+                            l.delta[box_index + 2 * stride] += scale * (0 - l.output[box_index + 2 * stride]);
+                            l.delta[box_index + 3 * stride] += scale * (0 - l.output[box_index + 3 * stride]);
                        }
                    }
                    if (best_iou > l.truth_thresh) {
@ -439,7 +457,8 @@ void forward_yolo_layer(const layer l, network_state state)
                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                        if (l.objectness_smooth) l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
                        box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1);
-                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
+                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta, state.net.rewritten_bbox);
+                        (*state.net.total_bbox)++;
                    }
                }
            }
@ -481,7 +500,8 @@ void forward_yolo_layer(const layer l, network_state state)

                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
-                ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
+                ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta, state.net.rewritten_bbox);
+                (*state.net.total_bbox)++;

                // range is 0 <= 1
                tot_iou += all_ious.iou;
@ -528,7 +548,8 @@ void forward_yolo_layer(const layer l, network_state state)

                        int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
-                        ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
+                        ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta, state.net.rewritten_bbox);
+                        (*state.net.total_bbox)++;

                        // range is 0 <= 1
                        tot_iou += all_ious.iou;