diff --git a/dlib/dnn/cuda_utils.h b/dlib/dnn/cuda_utils.h
index 141d802b8..bc96c0f95 100644
--- a/dlib/dnn/cuda_utils.h
+++ b/dlib/dnn/cuda_utils.h
@@ -261,7 +261,7 @@ namespace dlib
                     CUDA's y thread index (e.g. threadIdx.y) instead of the x index.
                     Therefore, if you launch a cuda kernel with a statement like:
                         dim3 blocks(10,1);
-                        dim3 threads(32,32);  // You need to have x any not equal to 1 to get parallelism over both loops.
+                        dim3 threads(32,32);  // Both x and y must be greater than 1 to get parallelism over both loops.
                         add_arrays<<<blocks,threads>>>(a,b,out,nr,nc);
                     You can perform a nested 2D parallel for loop rather than doing just a
                     1D for loop.