diff --git a/dlib/dnn/cuda_utils.h b/dlib/dnn/cuda_utils.h index 141d802b8..bc96c0f95 100644 --- a/dlib/dnn/cuda_utils.h +++ b/dlib/dnn/cuda_utils.h @@ -261,7 +261,7 @@ namespace dlib CUDA's y thread index (e.g. threadIdx.y) instead of the x index. Therefore, if you launch a cuda kernel with a statement like: dim3 blocks(10,1); - dim3 threads(32,32); // You need to have x any not equal to 1 to get parallelism over both loops. + dim3 threads(32,32); // You need to have x and y not equal to 1 to get parallelism over both loops. add_arrays<<<blocks,threads>>>(a,b,out,nr,nc); You can perform a nested 2D parallel for loop rather than doing just a 1D for loop.