NaN in batchnorm_layer is fixed

AlexeyAB 2020-02-16 18:23:49 +03:00
parent 8dc4833e7f
commit e910f4a839
2 changed files with 35 additions and 29 deletions

src/blas.c

@@ -287,7 +287,7 @@ void normalize_cpu(float *x, float *mean, float *variance, int batch, int filter
         for(f = 0; f < filters; ++f){
             for(i = 0; i < spatial; ++i){
                 int index = b*filters*spatial + f*spatial + i;
-                x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
+                x[index] = (x[index] - mean[f])/(sqrt(variance[f] + .000001f));
             }
         }
     }
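The one-character move above is the whole NaN fix on the CPU path: a per-filter variance that is mathematically zero can come out slightly negative in floating point, and sqrt() of a negative argument is NaN, which then propagates through every normalized value. A minimal sketch of the difference (not part of the commit; the variance value is invented for illustration):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float variance = -1e-8f;                       /* tiny negative value from rounding error */
    float old_denom = sqrtf(variance) + .000001f;  /* old form: sqrt(negative) is NaN, and NaN + eps stays NaN */
    float new_denom = sqrtf(variance + .000001f);  /* new form: eps is added first, so the argument stays positive */
    printf("old: %f  new: %f\n", old_denom, new_denom);
    return 0;
}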

src/blas_kernels.cu

@@ -9,24 +9,26 @@
 #include "utils.h"
 #include "tree.h"
 
-__global__ void scale_bias_kernel(float *output, float *biases, int n, int size)
-{
-    int offset = blockIdx.x * blockDim.x + threadIdx.x;
-    int filter = blockIdx.y;
-    int batch = blockIdx.z;
-
-    if(offset < size) output[(batch*n+filter)*size + offset] *= biases[filter];
+__global__ void scale_bias_kernel(float *output, float *scale, int batch, int filters, int spatial, int current_size)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= current_size) return;
+
+    int f = (index / spatial) % filters;
+    output[index] *= scale[f];
 }
 
-void scale_bias_gpu(float *output, float *biases, int batch, int n, int size)
+void scale_bias_gpu(float *output, float *scale, int batch, int filters, int spatial)
 {
-    dim3 dimGrid((size-1)/BLOCK + 1, n, batch);
-    dim3 dimBlock(BLOCK, 1, 1);
-
-    scale_bias_kernel<<<dimGrid, dimBlock, 0, get_cuda_stream()>>>(output, biases, n, size);
+    const int current_size = batch * filters * spatial;
+    const int num_blocks = get_number_of_blocks(current_size, BLOCK);
+
+    scale_bias_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(output, scale, batch, filters, spatial, current_size);
     CHECK_CUDA(cudaPeekAtLastError());
 }
 
 __global__ void backward_scale_kernel(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
 {
     __shared__ float part[BLOCK];
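Besides renaming biases to scale, the launch strategy changes: the old kernel used a 3-D grid where blockIdx.y picked the filter and blockIdx.z the batch, while the new one runs one thread per output element over a flat index and recovers the filter itself via (index / spatial) % filters. A small host-side check (illustrative only, sizes invented) that the flat decomposition matches the old (batch, filter, offset) addressing:

#include <assert.h>

int main(void)
{
    const int batch = 2, filters = 3, spatial = 5;    /* arbitrary small sizes */
    for (int b = 0; b < batch; ++b)
        for (int f = 0; f < filters; ++f)
            for (int o = 0; o < spatial; ++o) {
                int index = (b * filters + f) * spatial + o;   /* old addressing: (batch*n + filter)*size + offset */
                assert((index / spatial) % filters == f);      /* new addressing: filter recovered from the flat index */
                assert(index / (filters * spatial) == b);
                assert(index % spatial == o);
            }
    return 0;
}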
@@ -53,21 +55,21 @@ void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size,
     CHECK_CUDA(cudaPeekAtLastError());
 }
 
-__global__ void add_bias_kernel(float *output, float *biases, int n, int size)
-{
-    int offset = blockIdx.x * blockDim.x + threadIdx.x;
-    int filter = blockIdx.y;
-    int batch = blockIdx.z;
-
-    if(offset < size) output[(batch*n+filter)*size + offset] += biases[filter];
+__global__ void add_bias_kernel(float *output, float *biases, int batch, int filters, int spatial, int current_size)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index >= current_size) return;
+
+    int f = (index / spatial) % filters;
+    output[index] += biases[f];
 }
 
-void add_bias_gpu(float *output, float *biases, int batch, int n, int size)
+void add_bias_gpu(float *output, float *biases, int batch, int filters, int spatial)
 {
-    dim3 dimGrid((size-1)/BLOCK + 1, n, batch);
-    dim3 dimBlock(BLOCK, 1, 1);
-
-    add_bias_kernel<<<dimGrid, dimBlock, 0, get_cuda_stream()>>>(output, biases, n, size);
+    const int current_size = batch * filters * spatial;
+    const int num_blocks = get_number_of_blocks(current_size, BLOCK);
+
+    add_bias_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(output, biases, batch, filters, spatial, current_size);
     CHECK_CUDA(cudaPeekAtLastError());
 }
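add_bias_kernel gets the same one-thread-per-element treatment, with get_number_of_blocks() sizing the grid. That helper is not shown in this diff; assuming it is the usual ceiling division, an equivalent stand-in (hypothetical name, BLOCK assumed to be 512) looks like this:

#include <stdio.h>

/* Stand-in for get_number_of_blocks(); assumed to be plain ceiling division. */
static int number_of_blocks(int array_size, int block_size)
{
    return array_size / block_size + ((array_size % block_size) > 0 ? 1 : 0);
}

int main(void)
{
    const int current_size = 1 * 32 * 13 * 13;   /* e.g. batch=1, filters=32, spatial=13*13 -> 5408 elements */
    const int block = 512;
    printf("%d blocks of %d threads\n", number_of_blocks(current_size, block), block);
    /* 11 blocks = 5632 threads; the kernels' "if (index >= current_size) return;"
       guard discards the 224 surplus threads */
    return 0;
}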
@@ -173,11 +175,20 @@ extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1
 __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
 {
-    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
     if (index >= N) return;
-    int f = (index/spatial)%filters;
-    x[index] = (x[index] - mean[f])/(sqrtf(variance[f]) + .000001f);
+    int f = (index / spatial) % filters;
+    x[index] = (x[index] - mean[f]) / (sqrtf(variance[f] + .000001f));
+}
+
+extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
+{
+    const int current_size = batch * filters * spatial;
+    const int num_blocks = get_number_of_blocks(current_size, BLOCK);
+
+    normalize_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(current_size, x, mean, variance, batch, filters, spatial);
+    CHECK_CUDA(cudaPeekAtLastError());
 }
 
 __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
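The GPU normalization now mirrors the CPU fix (epsilon inside sqrtf), and normalize_gpu() moves up next to its kernel, launched with the same flat-grid pattern. A hypothetical standalone driver, compiled with nvcc and linked against this file, that exercises the relocated wrapper (buffer sizes and values invented for illustration):

#include <cstdio>
#include <cuda_runtime.h>

extern "C" void normalize_gpu(float *x, float *mean, float *variance,
                              int batch, int filters, int spatial);

int main()
{
    const int batch = 1, filters = 2, spatial = 4;
    const int n = batch * filters * spatial;
    float h_x[8]    = {1, 2, 3, 4, 5, 6, 7, 8};
    float h_mean[2] = {2.5f, 6.5f};
    float h_var[2]  = {1.25f, 0.0f};   /* zero variance: old code scaled by ~1e6, new code by ~1e3 */

    float *d_x, *d_mean, *d_var;
    cudaMalloc(&d_x, n * sizeof(float));
    cudaMalloc(&d_mean, filters * sizeof(float));
    cudaMalloc(&d_var, filters * sizeof(float));
    cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_mean, h_mean, filters * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_var, h_var, filters * sizeof(float), cudaMemcpyHostToDevice);

    normalize_gpu(d_x, d_mean, d_var, batch, filters, spatial);

    cudaMemcpy(h_x, d_x, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) printf("%g ", h_x[i]);   /* all values finite, no NaN */
    printf("\n");
    cudaFree(d_x); cudaFree(d_mean); cudaFree(d_var);
    return 0;
}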
@@ -459,12 +470,7 @@ __global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY)
 }
 
-extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
-{
-    size_t N = batch*filters*spatial;
-    normalize_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream()>>>(N, x, mean, variance, batch, filters, spatial);
-    CHECK_CUDA(cudaPeekAtLastError());
-}
-
 __global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {