col2im maybe a little faster (branch-free start index; bounds check moved into the inner loop)

This commit is contained in:
Joseph Redmon 2014-10-30 11:28:37 -07:00
parent 27d0c922ea
commit 2b2441313b
4 changed files with 25 additions and 10 deletions

View File

@ -308,7 +308,7 @@ void train_assira()
void train_imagenet() void train_imagenet()
{ {
network net = parse_network_cfg("cfg/imagenet_backup_slowest_2340.cfg"); network net = parse_network_cfg("cfg/imagenet_small_830.cfg");
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
int imgs = 1000/net.batch+1; int imgs = 1000/net.batch+1;
srand(6472345); srand(6472345);
@ -1016,6 +1016,17 @@ void test_gpu_net()
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
int i;
int ksize = 3;
int stride = 4;
int width_col = 20;
for(i = 0; i < 10; ++i){
int start = (i<ksize)?0:(i-ksize)/stride + 1;
int start2 = (i-ksize+stride)/stride;
int end = i/stride + 1;
end = (width_col < end) ? width_col : end;
printf("%d: %d vs %d, %d\n", i, start,start2, end);
}
if(argc != 2){ if(argc != 2){
fprintf(stderr, "usage: %s <function>\n", argv[0]); fprintf(stderr, "usage: %s <function>\n", argv[0]);
return 0; return 0;

View File

@ -21,13 +21,15 @@ __kernel void col2im(__global float *data_col, int batch,
id /= channels; id /= channels;
int b = id%batch; int b = id%batch;
int w_start = (w<ksize)?0:(w-ksize)/stride + 1; //int w_start = (w<ksize)?0:(w-ksize)/stride + 1;
int w_start = (w-ksize+stride)/stride;
int w_end = w/stride + 1; int w_end = w/stride + 1;
w_end = (width_col < w_end) ? width_col : w_end; //w_end = (width_col < w_end) ? width_col : w_end;
int h_start = (h<ksize)?0:(h-ksize)/stride+1; int h_start = (h-ksize+stride)/stride;
//int h_start = (h-ksize)/stride+1;
int h_end = h/stride + 1; int h_end = h/stride + 1;
h_end = (height_col < h_end) ? height_col : h_end; //h_end = (height_col < h_end) ? height_col : h_end;
int rows = channels * ksize * ksize; int rows = channels * ksize * ksize;
int cols = height_col*width_col; int cols = height_col*width_col;
@ -39,7 +41,9 @@ __kernel void col2im(__global float *data_col, int batch,
int h_col, w_col; int h_col, w_col;
for(h_col = h_start; h_col < h_end; ++h_col){ for(h_col = h_start; h_col < h_end; ++h_col){
for(w_col = w_start; w_col < w_end; ++w_col){ for(w_col = w_start; w_col < w_end; ++w_col){
val += data_col[offset +h_col*h_coeff + w_col*w_coeff]; int col_index = offset +h_col*h_coeff + w_col*w_coeff;
float part = (w_col < 0 || h_col < 0 || h_col >= height_col || w_col >= width_col) ? 0 : data_col[col_index];
val += part;
} }
} }
data_im[index] = val; data_im[index] = val;

View File

@ -336,7 +336,7 @@ void bias_output_gpu(const convolutional_layer layer)
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl);
check_error(cl); check_error(cl);
const size_t global_size[] = {layer.batch, layer.n*size}; const size_t global_size[] = {layer.n*size, layer.batch};
clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0); clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0);
check_error(cl); check_error(cl);

View File

@ -1,10 +1,10 @@
__kernel void bias(int n, int size, __global float *biases, __global float *output) __kernel void bias(int n, int size, __global float *biases, __global float *output)
{ {
int batch = get_global_id(0); int id = get_global_id(0);
int id = get_global_id(1); int batch = get_global_id(1);
int filter = id/size; int filter = id/size;
int position = id%size; //int position = id%size;
output[batch*n*size + id] = biases[filter]; output[batch*n*size + id] = biases[filter];
} }