diff --git a/dlib/cuda/gpu_data.cpp b/dlib/cuda/gpu_data.cpp
index 6e7cec6be..3c999835d 100644
--- a/dlib/cuda/gpu_data.cpp
+++ b/dlib/cuda/gpu_data.cpp
@@ -118,6 +118,25 @@ namespace dlib
         }
     }
 
+#ifdef WIN32
+    // Waits for `stream` to drain by polling cudaStreamQuery in a busy loop.  Used in
+    // place of cudaStreamSynchronize, which for some reason makes training freeze on
+    // some Windows machines (see https://github.com/davisking/dlib/issues/1513).
+    void synchronize_stream(cudaStream_t stream)
+    {
+        for (;;)
+        {
+            const cudaError_t err = cudaStreamQuery(stream);
+            if (err == cudaSuccess)
+                return;               // stream is idle: we are synchronized
+            if (err != cudaErrorNotReady)
+            {
+                CHECK_CUDA(err);      // anything but "not ready" is a failure: throw
+            }
+        }
+    }
+#endif // WIN32
+
     void gpu_data::
     async_copy_to_device() const
     {
@@ -127,7 +146,12 @@ namespace dlib
             {
                 // Wait for any possible CUDA kernels that might be using our memory block to
                 // complete before we overwrite the memory.
+#ifdef WIN32
+                synchronize_stream(0);
+#else
                 CHECK_CUDA(cudaStreamSynchronize(0));
+#endif
+
                 device_in_use = false;
             }
             CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));