mirror of https://github.com/davisking/dlib.git
Made copy_tensor() use cudaMemcpyAsync() rather than cudaMemcpy().
This commit is contained in:
parent
aafa411672
commit
89c9267e46
|
@@ -1401,7 +1401,7 @@ namespace dlib
|
|||
|
||||
for (long i = 0; i < src.num_samples(); ++i)
|
||||
{
|
||||
CHECK_CUDA(cudaMemcpy(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
CHECK_CUDA(cudaMemcpyAsync(dest_p, src_p, block_size * sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
|
||||
dest_p += dest_sample_size;
|
||||
src_p += src_sample_size;
|
||||
|
|
|
@@ -1558,6 +1558,7 @@ namespace dlib { namespace tt
|
|||
- dest.k() - dest_k_offset >= count_k
|
||||
- src.k() - src_k_offset >= count_k
|
||||
- is_same_object(dest,src) == false
|
||||
- The memory areas of src and dest do not overlap.
|
||||
ensures
|
||||
- performs: dest[i, k + dest_k_offset, r, c] = src[i, k + src_k_offset, r, c], where k in [0..count_k]
|
||||
Copies content of each sample from src into the corresponding place of the sample at dest.
|
||||
|
|
Loading…
Reference in New Issue