diff --git a/include/darknet.h b/include/darknet.h
index 32077ccf..e7063e30 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -727,6 +727,7 @@ typedef struct network {
     int letter_box;
     int mosaic_bound;
     int contrastive;
+    int unsupervised;
     float angle;
     float aspect;
     float exposure;
diff --git a/src/blas.c b/src/blas.c
index adcce984..7bfc7528 100644
--- a/src/blas.c
+++ b/src/blas.c
@@ -590,9 +590,10 @@ void grad_contrastive_loss_positive(int i, int *labels, int num_of_samples, floa
         fprintf(stderr, " Error: N == 0 || temperature == 0 || vec_len == 0. N=%f, temperature=%f, vec_len=%f \n", N, temperature, vec_len);
         getchar();
     }
-    const float mult = 1 / ((2 * N - 1) * temperature * vec_len);
+    const float mult = 1 / ((N - 1) * temperature * vec_len);

     for (j = 0; j < num_of_samples; ++j) {
+        //if (i != j && (i/2) == (j/2)) {
         if (i != j && labels[i] == labels[j]) {
             const float sim = cos_sim[i*num_of_samples + j];    // cosine_similarity(z[i], z[j], feature_size);
             const float P = p_constrastive[i*num_of_samples + j];   // P_constrastive(i, j, labels, num_of_samples, z, feature_size, temperature, cos_sim);
@@ -600,8 +601,8 @@ void grad_contrastive_loss_positive(int i, int *labels, int num_of_samples, floa

             int m;
             for (m = 0; m < feature_size; ++m) {
-                //const float d = mult*(sim * z[i][m] - z[j][m]) * (1 - P); // bad
-                const float d = mult*(sim * z[j][m] - z[j][m]) * (1 - P); // good
+                const float d = mult*(sim * z[i][m] - z[j][m]) * (1 - P); // good
+                //const float d = mult*(sim * z[j][m] - z[j][m]) * (1 - P); // bad
                 // printf(" pos: z[j][m] = %f, z[i][m] = %f, d = %f, sim = %f \n", z[j][m], z[i][m], d, sim);
                 delta[m] -= d;
             }
@@ -626,9 +627,10 @@ void grad_contrastive_loss_negative(int i, int *labels, int num_of_samples, floa
         fprintf(stderr, " Error: N == 0 || temperature == 0 || vec_len == 0. N=%f, temperature=%f, vec_len=%f \n", N, temperature, vec_len);
         getchar();
     }
-    const float mult = 1 / ((2 * N - 1) * temperature * vec_len);
+    const float mult = 1 / ((N - 1) * temperature * vec_len);

     for (j = 0; j < num_of_samples; ++j) {
+        //if (i != j && (i/2) == (j/2)) {
         if (i != j && labels[i] == labels[j]) {

             int k;
@@ -641,8 +643,8 @@ void grad_contrastive_loss_negative(int i, int *labels, int num_of_samples, floa

                 int m;
                 for (m = 0; m < feature_size; ++m) {
-                    //const float d = mult*(z[k][m] - sim * z[i][m]) * P; // bad
-                    const float d = mult*(z[k][m] - sim * z[k][m]) * P; // good
+                    const float d = mult*(z[k][m] - sim * z[i][m]) * P; // good
+                    //const float d = mult*(z[k][m] - sim * z[k][m]) * P; // bad
                     //printf(" neg: z[k][m] = %f, z[i][m] = %f, d = %f, sim = %f \n", z[k][m], z[i][m], d, sim);
                     delta[m] -= d;
                 }
diff --git a/src/classifier.c b/src/classifier.c
index e340ed96..915be9cd 100644
--- a/src/classifier.c
+++ b/src/classifier.c
@@ -69,13 +69,18 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
     int topk_data = option_find_int(options, "top", 5);
     char topk_buff[10];
     sprintf(topk_buff, "top%d", topk_data);
-    if (classes != net.layers[net.n - 1].inputs) {
+    layer l = net.layers[net.n - 1];
+    if (classes != l.outputs && (l.type == SOFTMAX || l.type == COST)) {
         printf("\n Error: num of filters = %d in the last conv-layer in cfg-file doesn't match to classes = %d in data-file \n",
-            net.layers[net.n - 1].inputs, classes);
+            l.outputs, classes);
         getchar();
     }

     char **labels = get_labels(label_list);
+    if (net.unsupervised) {
+        free(labels);
+        labels = NULL;
+    }
     list *plist = get_paths(train_list);
     char **paths = (char **)list_to_array(plist);
     printf("%d\n", plist->size);
@@ -184,8 +189,16 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
         int draw_precision = 0;
         if (calc_topk && (i >= calc_topk_for_each || i == net.max_batches)) {
             iter_topk = i;
-            topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP-n
-            printf("\n accuracy %s = %f \n", topk_buff, topk);
+            if (net.contrastive && l.type != SOFTMAX && l.type != COST) {
+                int k;
+                for (k = 0; k < net.n; ++k) if (net.layers[k].type == CONTRASTIVE) break;
+                topk = *(net.layers[k].loss) / 100;
+                sprintf(topk_buff, "Contr");
+            }
+            else {
+                topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP-n
+                printf("\n accuracy %s = %f \n", topk_buff, topk);
+            }
             draw_precision = 1;
         }

@@ -240,7 +253,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
     free(nets);

     //free_ptrs((void**)labels, classes);
-    free(labels);
+    if(labels) free(labels);
     free_ptrs((void**)paths, plist->size);
     free_list(plist);
     free(nets);
@@ -820,9 +833,10 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
     if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
     int classes = option_find_int(options, "classes", 2);
     printf(" classes = %d, output in cfg = %d \n", classes, net.layers[net.n - 1].c);
-    if (classes != net.layers[net.n - 1].inputs) {
+    layer l = net.layers[net.n - 1];
+    if (classes != l.outputs && (l.type == SOFTMAX || l.type == COST)) {
         printf("\n Error: num of filters = %d in the last conv-layer in cfg-file doesn't match to classes = %d in data-file \n",
-            net.layers[net.n - 1].inputs, classes);
+            l.outputs, classes);
         getchar();
     }
     if (top == 0) top = option_find_int(options, "top", 1);
diff --git a/src/data.c b/src/data.c
index eb1759db..d1fa4c12 100644
--- a/src/data.c
+++ b/src/data.c
@@ -613,7 +613,9 @@ matrix load_labels_paths(char **paths, int n, char **labels, int k, tree *hierar
     }
     else {
         // unsupervised learning
         for (i = 0; i < n; ++i) {
-            const int class_id = i / 2;
+            const int img_index = (contrastive) ? (i / 2) : i;
+            const uintptr_t path_p = (uintptr_t)paths[img_index];// abs(random_gen());
+            const int class_id = path_p % k;
             int l;
             for (l = 0; l < k; ++l) y.vals[i][l] = 0;
             y.vals[i][class_id] = 1;
diff --git a/src/parser.c b/src/parser.c
index 1306a773..c43ef678 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1130,6 +1130,7 @@ void parse_net_options(list *options, network *net)
     net->letter_box = option_find_int_quiet(options, "letter_box", 0);
     net->mosaic_bound = option_find_int_quiet(options, "mosaic_bound", 0);
     net->contrastive = option_find_int_quiet(options, "contrastive", 0);
+    net->unsupervised = option_find_int_quiet(options, "unsupervised", 0);
     net->label_smooth_eps = option_find_float_quiet(options, "label_smooth_eps", 0.0f);
     net->resize_step = option_find_float_quiet(options, "resize_step", 32);
     net->attention = option_find_int_quiet(options, "attention", 0);
diff --git a/src/softmax_layer.c b/src/softmax_layer.c
index fb4e91bd..ffbefffc 100644
--- a/src/softmax_layer.c
+++ b/src/softmax_layer.c
@@ -141,7 +141,7 @@ contrastive_layer make_contrastive_layer(int batch, int w, int h, int n, int cla
     l.n = n;
     l.classes = classes;
     l.temperature = 1;
-    //l.loss = (float*)xcalloc(inputs * batch, sizeof(float));
+    l.loss = (float*)xcalloc(1, sizeof(float));
     l.output = (float*)xcalloc(inputs * batch, sizeof(float));
     l.delta = (float*)xcalloc(inputs * batch, sizeof(float));
     l.cost = (float*)xcalloc(1, sizeof(float));
@@ -156,7 +156,6 @@ contrastive_layer make_contrastive_layer(int batch, int w, int h, int n, int cla
     l.backward_gpu = backward_contrastive_layer_gpu;

     l.output_gpu = cuda_make_array(l.output, inputs*batch);
-    //l.loss_gpu = cuda_make_array(l.loss, inputs*batch);
     l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
     //l.cos_sim_gpu = cuda_make_array(l.cos_sim, l.batch*l.batch);
 #endif
@@ -164,10 +163,10 @@ contrastive_layer make_contrastive_layer(int batch, int w, int h, int n, int cla
 }


-void forward_contrastive_layer(const contrastive_layer l, network_state state)
+void forward_contrastive_layer(contrastive_layer l, network_state state)
 {
     if (!state.train) return;
-    const float truth_thresh = 0.2;
+    const float truth_thresh = state.net.label_smooth_eps;

     memset(l.delta, 0, l.batch*l.inputs * sizeof(float));

@@ -183,7 +182,8 @@ void forward_contrastive_layer(const contrastive_layer l, network_state state)
         for (n = 0; n < l.classes; ++n) {
             const float truth_prob = state.truth[b*l.classes + n];
             //printf(" truth_prob = %f, ", truth_prob);
-            if (truth_prob > max_truth)
+            //if (truth_prob > max_truth)
+            if (truth_prob > truth_thresh)
             {
                 max_truth = truth_prob;
                 l.labels[b] = n;
@@ -228,7 +228,8 @@ void forward_contrastive_layer(const contrastive_layer l, network_state state)
         //printf(" l.labels[b] = %d, l.labels[b+1] = %d, l.labels[b+2] = %d, b = %d \n", l.labels[b], l.labels[b + 1], l.labels[b + 2], b);
         //printf(" same = %f, aug = %f, diff = %f, (aug > diff) = %d \n", same, aug, diff, (aug > diff));
     }
-    printf("good contrast = %f %% \n", 100 * good_contrast / (l.batch/2));
+    *l.loss = 100 * good_contrast / (l.batch / 2);
+    printf(" Contrast accuracy = %f %% \n", *l.loss);

     // precalculate P_contrastive
     for (b = 0; b < l.batch; ++b) {
@@ -251,10 +252,10 @@ void forward_contrastive_layer(const contrastive_layer l, network_state state)
             {
                 //printf(" b = %d, ", b);
                 // positive
-                grad_contrastive_loss_positive(b, l.labels, l.batch, z, l.n, l.temperature, l.cos_sim, l.p_constrastive, l.delta);
+                grad_contrastive_loss_positive(b, l.labels, l.batch, z, l.n, l.temperature, l.cos_sim, l.p_constrastive, l.delta + b*l.inputs);

                 // negative
-                grad_contrastive_loss_negative(b, l.labels, l.batch, z, l.n, l.temperature, l.cos_sim, l.p_constrastive, l.delta);
+                grad_contrastive_loss_negative(b, l.labels, l.batch, z, l.n, l.temperature, l.cos_sim, l.p_constrastive, l.delta + b*l.inputs);
             }
         }
     }
@@ -264,9 +265,9 @@ void forward_contrastive_layer(const contrastive_layer l, network_state state)
     free(z);
 }

-void backward_contrastive_layer(const contrastive_layer l, network_state net)
+void backward_contrastive_layer(contrastive_layer l, network_state state)
 {
-    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, state.delta, 1);
 }

@@ -283,10 +284,10 @@ void push_contrastive_layer_output(const contrastive_layer l)
 }


-void forward_contrastive_layer_gpu(const contrastive_layer l, network_state state)
+void forward_contrastive_layer_gpu(contrastive_layer l, network_state state)
 {
-    if (!state.train) return;
     simple_copy_ongpu(l.batch*l.inputs, state.input, l.output_gpu);
+    if (!state.train) return;

     float *in_cpu = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
     cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
@@ -311,7 +312,7 @@ void forward_contrastive_layer_gpu(const contrastive_layer l, network_state stat
     if (cpu_state.truth) free(cpu_state.truth);
 }

-void backward_contrastive_layer_gpu(const contrastive_layer layer, network_state state)
+void backward_contrastive_layer_gpu(contrastive_layer layer, network_state state)
 {
     axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
 }
diff --git a/src/softmax_layer.h b/src/softmax_layer.h
index d49bddc4..f1985a99 100644
--- a/src/softmax_layer.h
+++ b/src/softmax_layer.h
@@ -23,14 +23,14 @@ void backward_softmax_layer_gpu(const softmax_layer l, network_state state);

 //-----------------------
 contrastive_layer make_contrastive_layer(int batch, int w, int h, int n, int classes, int inputs);
-void forward_contrastive_layer(const contrastive_layer l, network_state state);
-void backward_contrastive_layer(const contrastive_layer l, network_state net);
+void forward_contrastive_layer(contrastive_layer l, network_state state);
+void backward_contrastive_layer(contrastive_layer l, network_state net);

 #ifdef GPU
 void pull_contrastive_layer_output(const contrastive_layer l);
 void push_contrastive_layer_output(const contrastive_layer l);
-void forward_contrastive_layer_gpu(const contrastive_layer l, network_state state);
-void backward_contrastive_layer_gpu(const contrastive_layer layer, network_state state);
+void forward_contrastive_layer_gpu(contrastive_layer l, network_state state);
+void backward_contrastive_layer_gpu(contrastive_layer layer, network_state state);
 #endif

 #ifdef __cplusplus
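
For readers checking the math behind the blas.c change: the derivative of the cosine similarity between z[i] and z[j] with respect to z[i][m] mixes z[j][m] with the projection term sim * z[i][m], so the variant previously marked "good", which used z[j][m] in both places, collapses to something proportional to z[j][m] * (sim - 1) and vanishes as a positive pair converges. Below is a minimal standalone C sketch of the corrected positive-pair update, not part of the patch; the embeddings, N, and the placeholder P are toy values, not anything darknet produces.

/* Minimal sketch of the corrected positive-pair gradient from
 * grad_contrastive_loss_positive() above. Toy data only; darknet's real code
 * loops over all positive pairs and computes P via P_constrastive(). */
#include <math.h>
#include <stdio.h>

static float vec_len(const float *v, int n) {
    float s = 0; int m;
    for (m = 0; m < n; ++m) s += v[m] * v[m];
    return sqrtf(s);
}

static float cos_sim(const float *a, const float *b, int n) {
    float dot = 0; int m;
    for (m = 0; m < n; ++m) dot += a[m] * b[m];
    return dot / (vec_len(a, n) * vec_len(b, n));
}

int main(void) {
    enum { FEATURE_SIZE = 4 };
    float zi[FEATURE_SIZE] = { 0.5f, -0.1f, 0.8f, 0.2f };  /* anchor embedding (toy) */
    float zj[FEATURE_SIZE] = { 0.4f,  0.0f, 0.7f, 0.3f };  /* its positive pair (toy) */
    float delta[FEATURE_SIZE] = { 0 };

    const float temperature = 1.0f;
    const float N = 2;      /* number of samples sharing this label (toy) */
    const float P = 0.5f;   /* placeholder for p_constrastive[i*num_of_samples + j] */
    const float sim = cos_sim(zi, zj, FEATURE_SIZE);
    /* mult mirrors the patched normalization 1 / ((N - 1) * temperature * vec_len) */
    const float mult = 1 / ((N - 1) * temperature * vec_len(zi, FEATURE_SIZE));

    int m;
    for (m = 0; m < FEATURE_SIZE; ++m) {
        /* Corrected term: mixes z[j][m] with the projection sim * z[i][m].
         * The old line used z[j][m] in both places, so the whole expression
         * reduced to z[j][m] * (sim - 1), going to zero as the pair becomes
         * similar and never pointing along the true similarity gradient. */
        const float d = mult * (sim * zi[m] - zj[m]) * (1 - P);
        delta[m] -= d;
    }

    for (m = 0; m < FEATURE_SIZE; ++m) printf("delta[%d] = %f\n", m, delta[m]);
    return 0;
}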
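The delta indexing fix in softmax_layer.c is easier to see with the buffer layout spelled out: grad_contrastive_loss_positive() and grad_contrastive_loss_negative() write feature_size floats starting at whatever pointer they receive, so the old calls, which passed the unoffset l.delta for every b, accumulated the whole batch into sample 0's slice. A sketch with hypothetical sizes:

/* Why forward_contrastive_layer now passes l.delta + b*l.inputs: l.delta is
 * one flat buffer for the whole batch, laid out [sample 0 | sample 1 | ...],
 * each slice l.inputs floats wide. BATCH and INPUTS here are made up. */
#include <stdio.h>
#include <stdlib.h>

#define BATCH  4
#define INPUTS 8   /* stands in for feature_size of the contrastive layer */

int main(void) {
    float *delta = (float *)calloc(BATCH * INPUTS, sizeof(float));
    int b, m;
    for (b = 0; b < BATCH; ++b) {
        float *delta_b = delta + b * INPUTS;   /* sample b's own slice */
        /* Stand-in for the grad_contrastive_loss_* calls, which write exactly
         * INPUTS floats starting at the pointer they receive. */
        for (m = 0; m < INPUTS; ++m) delta_b[m] -= (float)(b + 1);
    }
    /* With the unoffset pointer, all four samples would have summed into
     * delta[0..INPUTS-1] and samples 1..3 would backpropagate nothing. */
    for (b = 0; b < BATCH; ++b) printf("delta[%d][0] = %f\n", b, delta[b * INPUTS]);
    free(delta);
    return 0;
}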
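Finally, the data.c change works together with the new unsupervised option parsed in parser.c: with unsupervised=1 in the [net] section, train_classifier() frees the label list, and load_labels_paths() falls back to a pseudo-label hashed from the image path pointer. The label is arbitrary but consistent within a run, and in contrastive mode rows 2t and 2t+1 (two augmentations of image t) share one pointer and therefore one class. A sketch with made-up paths:

/* Sketch of the pseudo-label assignment from load_labels_paths() above.
 * Each path pointer is reduced modulo k into a class id; in contrastive mode
 * rows 2t and 2t+1 index the same path and so get the same pseudo-class.
 * The paths and sizes here are made up. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    char *paths[2] = { "data/img0.jpg", "data/img1.jpg" };
    const int n = 4;            /* rows: 2 images x 2 augmentations */
    const int k = 1000;         /* number of pseudo-classes */
    const int contrastive = 1;  /* with contrastive=0, img_index would be i */
    int i;
    for (i = 0; i < n; ++i) {
        const int img_index = (contrastive) ? (i / 2) : i;
        const uintptr_t path_p = (uintptr_t)paths[img_index];
        const int class_id = (int)(path_p % k);
        printf("row %d -> image %d -> pseudo-class %d\n", i, img_index, class_id);
    }
    return 0;
}

The absolute class_id differs from run to run with the allocator, which appears harmless here: only the agreement between the two augmented rows of each image matters to the contrastive layer.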