Compile fix

AlexeyAB committed 2018-12-07 22:52:07 +03:00
parent 7c2f302321
commit 742bb7c7ce
2 changed files with 3 additions and 3 deletions

Makefile

@@ -7,7 +7,7 @@ OPENMP=0
 LIBSO=0
 # set GPU=1 and CUDNN=1 to speedup on GPU
-# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision using Tensor Cores) on GPU Tesla V100, Titan V, DGX-2
+# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision on Tensor Cores) GPU: Volta, Xavier, Turing and higher
 # set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0)
 DEBUG=0
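
As a usage sketch of the flags these comments describe, a mixed-precision build on Volta/Xavier/Turing-class hardware might set the following at the top of the Makefile (an illustrative combination, not part of this commit):

# Hypothetical build settings, per the comments above:
# enable CUDA, cuDNN, and the FP16 Tensor-Core path
GPU=1
CUDNN=1
CUDNN_HALF=1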

src/convolutional_kernels.cu

@@ -138,8 +138,8 @@ void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, f
 __global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    //if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
-    if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]);
+    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
+    //if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); // can't be compiled on Linux without casting
     // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn
     //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
 }
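
For reference, __float2half also converts in round-to-nearest-even mode, so switching away from __float2half_rn here changes the build, not the numerics, while avoiding the Linux compile error noted in the comment. Below is a minimal sketch of how a kernel with this signature might be driven from the host; the wrapper name, block size, and grid computation are illustrative assumptions, not code from this commit.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Kernel as it stands after this commit (signature taken from the diff above).
__global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
}

// Hypothetical host-side wrapper: one thread per element, 256 threads per
// block, grid rounded up so every element of the device buffer is covered.
void launch_f32_to_f16(float *src_gpu, half *dst_gpu, size_t n)
{
    const int block = 256;
    const int grid = (int)((n + block - 1) / block);
    cuda_f32_to_f16<<<grid, block>>>(src_gpu, n, dst_gpu);
}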