Apparently fix #1513 by avoiding the cudaStreamSynchronize call (#1514)

* Problem: see #1513 (https://github.com/davisking/dlib/issues/1513)
Candidate solution: busy-loop until cudaStreamQuery returns cudaSuccess

* Make the suggested fix a Windows-only thing
This commit is contained in:
Juha Reunanen 2018-11-11 17:33:15 +02:00 committed by Davis E. King
parent d2dcdd66ad
commit d7e6f1d726
1 changed file with 24 additions and 0 deletions

View File

@@ -118,6 +118,25 @@ namespace dlib
} }
} }
#ifdef WIN32
// Behaves like cudaStreamSynchronize, but polls the stream instead of
// blocking in the driver, because cudaStreamSynchronize makes training
// freeze on some Windows machines for reasons that are not understood.
// (see https://github.com/davisking/dlib/issues/1513)
// NOTE: this is a deliberate busy-wait; it will occupy a CPU core until
// all work queued on the stream has completed.
void synchronize_stream(cudaStream_t stream)
{
    for (;;)
    {
        const cudaError_t status = cudaStreamQuery(stream);
        if (status == cudaSuccess)
            return; // all work on the stream has completed
        if (status != cudaErrorNotReady)
            CHECK_CUDA(status); // a real error occurred: report it
        // status == cudaErrorNotReady: stream still busy, keep polling
    }
}
#endif // WIN32
void gpu_data::
async_copy_to_device() const
{
@@ -127,7 +146,12 @@ namespace dlib
{
// Wait for any possible CUDA kernels that might be using our memory block to
// complete before we overwrite the memory.
#ifdef WIN32
synchronize_stream(0);
#else
CHECK_CUDA(cudaStreamSynchronize(0));
#endif
device_in_use = false;
}
CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));