Moved gpu_data into its own file, fixed a few bugs, and cleaned up a few things.
Davis King 2015-10-17 22:02:04 -04:00
parent 0508fe2bf1
commit 3ba21975f7
9 changed files with 481 additions and 388 deletions

dlib/CMakeLists.txt

@@ -441,6 +441,7 @@ if (NOT TARGET dlib)
dnn/cuda_dlib.cu
dnn/cudnn_dlibapi.cpp
dnn/cublas_dlibapi.cpp
dnn/gpu_data.cpp
)
set(dlib_needed_libraries ${dlib_needed_libraries} ${CUDA_CUBLAS_LIBRARIES} ${cudnn})
include_directories(${cudnn_include})

dlib/dnn/cublas_dlibapi.h

@@ -29,8 +29,19 @@ namespace dlib
cublas_context(const cublas_context&) = delete;
cublas_context& operator=(const cublas_context&) = delete;
// but is movable
cublas_context(cublas_context&&) = default;
cublas_context& operator=(cublas_context&&) = default;
cublas_context(cublas_context&& item)
{
handle = item.handle;
item.handle = nullptr;
}
cublas_context& operator=(cublas_context&& item)
{
if (this == &item)
return *this;
handle = item.handle;
item.handle = nullptr;
return *this;
}
cublas_context();
~cublas_context();
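The hand-written moves above replace the previously defaulted ones: a defaulted move only copies the raw handle pointer, so the moved-from object would still appear to own the cuBLAS handle and both destructors would try to destroy it. A minimal sketch of the intended behavior (not part of this commit; it assumes cublas_context lives in dlib::cuda like the other CUDA wrappers touched here, and that the destructor skips a null handle):

    #include <utility>
    #include "cublas_dlibapi.h"   // header patched above (requires DLIB_USE_CUDA)

    void move_example()
    {
        dlib::cuda::cublas_context a;                  // owns a cuBLAS handle
        dlib::cuda::cublas_context b = std::move(a);   // b takes the handle, a.handle becomes nullptr
        // With the old "= default" moves, a.handle would have kept pointing at the same
        // handle, and destroying both a and b would have freed it twice.
    }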

dlib/dnn/cuda_errors.h

@@ -3,27 +3,22 @@
#ifndef DLIB_CUDA_ERRORs_H_
#define DLIB_CUDA_ERRORs_H_
#ifdef DLIB_USE_CUDA
#include "../error.h"
namespace dlib
{
namespace cuda
struct cuda_error : public error
{
struct cuda_error : public error
{
cuda_error(const std::string& message): error(message) {}
};
cuda_error(const std::string& message): error(message) {}
};
struct cudnn_error : public cuda_error
{
cudnn_error(const std::string& message): cuda_error(message) {}
};
}
struct cudnn_error : public cuda_error
{
cudnn_error(const std::string& message): cuda_error(message) {}
};
}
#endif // DLIB_USE_CUDA
#endif // DLIB_CUDA_ERRORs_H_
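Removing the nested namespace means the exception types are now spelled dlib::cuda_error and dlib::cudnn_error, which is what the updated CHECK_CUDA macro in cuda_utils.h throws. A short catching sketch (illustrative only):

    #include <iostream>
    #include "cuda_errors.h"   // header patched above (requires DLIB_USE_CUDA)

    void report_cuda_failures()
    {
        try
        {
            // ... call dlib code that uses CHECK_CUDA or the cuDNN check() helper ...
        }
        catch (const dlib::cudnn_error& e)   // derives from cuda_error, so catch it first
        {
            std::cerr << "cuDNN failure: " << e.what() << std::endl;
        }
        catch (const dlib::cuda_error& e)
        {
            std::cerr << "CUDA failure: " << e.what() << std::endl;
        }
    }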

dlib/dnn/cuda_utils.h

@@ -3,10 +3,13 @@
#ifndef DLIB_CUDA_UtILS_H_
#define DLIB_CUDA_UtILS_H_
#ifndef DLIB_USE_CUDA
#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined"
#endif
#include "cuda_errors.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <sstream>
@@ -19,7 +22,7 @@
std::ostringstream sout; \
sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\
throw dlib::cuda::cuda_error(sout.str()); \
throw dlib::cuda_error(sout.str()); \
} \
}
@@ -27,75 +30,82 @@
#ifdef __CUDACC__
class grid_stride_range
namespace dlib
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a tool for making a for loop that loops over an entire block of memory
inside a kernel, but doing so in a way that parallelizes appropriately across
all the threads in a kernel launch. For example, the following kernel would
add the vector a to the vector b and store the output in out (assuming all
vectors are of dimension n):
__global__ void add_arrays(
const float* a,
const float* b,
float* out,
size_t n
)
namespace cuda
{
class grid_stride_range
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a tool for making a for loop that loops over an entire block of
memory inside a kernel, but doing so in a way that parallelizes
appropriately across all the threads in a kernel launch. For example,
the following kernel would add the vector a to the vector b and store
the output in out (assuming all vectors are of dimension n):
__global__ void add_arrays(
const float* a,
const float* b,
float* out,
size_t n
)
{
for (auto i : grid_stride_range(0, n))
{
out[i] = a[i]+b[i];
}
}
!*/
public:
__device__ grid_stride_range(
size_t ibegin_,
size_t iend_
) :
ibegin(ibegin_),
iend(iend_)
{}
class iterator
{
public:
__device__ iterator() {}
__device__ iterator(size_t pos_) : pos(pos_) {}
__device__ size_t operator*() const
{
for (auto i : grid_stride_range(0, n))
{
out[i] = a[i]+b[i];
}
return pos;
}
!*/
public:
__device__ grid_stride_range(
size_t ibegin_,
size_t iend_
) :
ibegin(ibegin_),
iend(iend_)
{}
__device__ iterator& operator++()
{
pos += gridDim.x * blockDim.x;
return *this;
}
class iterator
{
public:
__device__ iterator() {}
__device__ iterator(size_t pos_) : pos(pos_) {}
__device__ bool operator!=(const iterator& item) const
{ return pos < item.pos; }
__device__ size_t operator*() const
{
return pos;
}
private:
size_t pos;
};
__device__ iterator& operator++()
{
pos += gridDim.x * blockDim.x;
return *this;
}
__device__ iterator begin() const
{
return iterator(ibegin+blockDim.x * blockIdx.x + threadIdx.x);
}
__device__ iterator end() const
{
return iterator(iend);
}
private:
__device__ bool operator!=(const iterator& item) const
{ return pos < item.pos; }
size_t ibegin;
size_t iend;
};
private:
size_t pos;
};
__device__ iterator begin() const
{
return iterator(ibegin+blockDim.x * blockIdx.x + threadIdx.x);
}
__device__ iterator end() const
{
return iterator(iend);
}
private:
size_t ibegin;
size_t iend;
};
}
#endif // __CUDACC__
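For reference, a launch-side sketch of the add_arrays example from the comment above (illustrative only; the grid and block sizes are arbitrary, since grid_stride_range strides each thread across the whole range regardless of how the kernel is launched):

    #include "cuda_utils.h"

    __global__ void add_arrays(const float* a, const float* b, float* out, size_t n)
    {
        // Each thread starts at its global index and advances by gridDim.x*blockDim.x,
        // so together the threads cover [0, n) exactly once.
        for (auto i : dlib::cuda::grid_stride_range(0, n))
            out[i] = a[i] + b[i];
    }

    void launch_add(const float* a, const float* b, float* out, size_t n)
    {
        add_arrays<<<512, 512>>>(a, b, out, n);
        CHECK_CUDA(cudaDeviceSynchronize());   // waits for the kernel; failures surface as dlib::cuda_error
    }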

dlib/dnn/cudnn_dlibapi.cpp

@@ -15,133 +15,6 @@
namespace dlib
{
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// gpu_data member functions
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// TODO, add error handling
void gpu_data::
wait_for_transfer_to_finish() const
{
if (have_active_transfer)
{
std::cout << "wait for cudaStreamSynchronize()" << std::endl;
CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
have_active_transfer = false;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_device() const
{
wait_for_transfer_to_finish();
if (!device_current)
{
std::cout << "cudaMemcpy to device" << std::endl;
CHECK_CUDA(cudaMemcpy(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice));
device_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_host() const
{
wait_for_transfer_to_finish();
if (!host_current)
{
std::cout << "cudaMemcpy to host" << std::endl;
CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
host_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
async_copy_to_device()
{
if (!device_current)
{
std::cout << "cudaMemcpyAsync to device" << std::endl;
CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
have_active_transfer = true;
device_current = true;
}
}
void gpu_data::
set_size(
size_t new_size
)
{
wait_for_transfer_to_finish();
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
try
{
void* data;
CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
// Note that we don't throw exceptions since the free calls are invariably
// called in destructors. They also shouldn't fail anyway unless someone
// is resetting the GPU card in the middle of their program.
data_host.reset((float*)data, [](float* ptr){
auto err = cudaFreeHost(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
data_device.reset((float*)data, [](float* ptr){
auto err = cudaFree(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
if (!cuda_stream)
{
cudaStream_t cstream;
CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
cuda_stream.reset(cstream, [](void* ptr){
auto err = cudaStreamDestroy((cudaStream_t)ptr);
if(err!=cudaSuccess)
std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
}
}
catch(...)
{
set_size(0);
throw;
}
}
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
namespace cuda
{
@@ -155,6 +28,8 @@ namespace dlib
throw cudnn_error("CUDA Runtime API initialization failed.");
case CUDNN_STATUS_ALLOC_FAILED:
throw cudnn_error("CUDA Resources could not be allocated.");
case CUDNN_STATUS_BAD_PARAM:
throw cudnn_error("CUDNN_STATUS_BAD_PARAM");
default:
throw cudnn_error("A call to cuDNN failed.");
}
@@ -180,20 +55,16 @@ namespace dlib
// ------------------------------------------------------------------------------------
tensor_descriptor::tensor_descriptor() : handle(nullptr)
tensor_descriptor::
tensor_descriptor(
) : handle(nullptr)
{
cudnnTensorDescriptor_t h;
check(cudnnCreateTensorDescriptor(&h));
handle = h;
}
tensor_descriptor::~tensor_descriptor()
tensor_descriptor::
~tensor_descriptor()
{
if (handle)
{
cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
handle = nullptr;
}
set_size(0,0,0,0);
}
void tensor_descriptor::
@@ -204,13 +75,28 @@ namespace dlib
int k
)
{
check(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT,
n,
k,
nr,
nc));
if (n == 0 || nr == 0 || nc == 0 || k == 0)
{
if (handle)
{
cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
handle = nullptr;
}
}
else
{
cudnnTensorDescriptor_t h;
check(cudnnCreateTensorDescriptor(&h));
handle = h;
check(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT,
n,
k,
nr,
nc));
}
}
void tensor_descriptor::
@@ -221,18 +107,28 @@ namespace dlib
int& k
) const
{
int nStride, cStride, hStride, wStride;
cudnnDataType_t datatype;
check(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
&datatype,
&n,
&k,
&nr,
&nc,
&nStride,
&cStride,
&hStride,
&wStride));
if (handle)
{
int nStride, cStride, hStride, wStride;
cudnnDataType_t datatype;
check(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
&datatype,
&n,
&k,
&nr,
&nc,
&nStride,
&cStride,
&hStride,
&wStride));
}
else
{
n = 0;
nr = 0;
nc = 0;
k = 0;
}
}
// ------------------------------------------------------------------------------------
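The net effect of the set_size()/get_size() changes above is that the cuDNN descriptor is now created and destroyed lazily: it only exists while the descriptor has a nonzero size, and the destructor simply calls set_size(0,0,0,0). A minimal sketch of that behavior (illustrative only, using the dlib::cuda::tensor_descriptor interface declared in cudnn_dlibapi.h):

    #include "cudnn_dlibapi.h"

    void descriptor_lifecycle()
    {
        dlib::cuda::tensor_descriptor d;   // no cudnnTensorDescriptor_t allocated yet
        int n, nr, nc, k;
        d.get_size(n, nr, nc, k);          // all four dimensions come back as 0
        d.set_size(0, 0, 0, 0);            // still nothing to create or destroy
        // Setting any nonzero size allocates the descriptor; setting it back to all
        // zeros (which is exactly what ~tensor_descriptor() now does) frees it again.
    }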

dlib/dnn/cudnn_dlibapi.h

@@ -25,8 +25,19 @@ namespace dlib
cudnn_context(const cudnn_context&) = delete;
cudnn_context& operator=(const cudnn_context&) = delete;
// but is movable
cudnn_context(cudnn_context&&) = default;
cudnn_context& operator=(cudnn_context&&) = default;
cudnn_context(cudnn_context&& item)
{
handle = item.handle;
item.handle = nullptr;
}
cudnn_context& operator=(cudnn_context&& item)
{
if (this == &item)
return *this;
handle = item.handle;
item.handle = nullptr;
return *this;
}
cudnn_context();
~cudnn_context();
@@ -53,8 +64,19 @@ namespace dlib
tensor_descriptor(const tensor_descriptor&) = delete;
tensor_descriptor& operator=(const tensor_descriptor&) = delete;
// but is movable
tensor_descriptor(tensor_descriptor&&) = default;
tensor_descriptor& operator=(tensor_descriptor&&) = default;
tensor_descriptor(tensor_descriptor&& item)
{
handle = item.handle;
item.handle = nullptr;
}
tensor_descriptor& operator=(tensor_descriptor&& item)
{
if (this == &item)
return *this;
handle = item.handle;
item.handle = nullptr;
return *this;
}
tensor_descriptor();
~tensor_descriptor();

dlib/dnn/gpu_data.cpp (new file, 142 lines)

@@ -0,0 +1,142 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_CPP_
#define DLIB_GPU_DaTA_CPP_
// Only things that require CUDA are declared in this cpp file. Everything else is in the
// gpu_data.h header so that it can operate as "header-only" code when using just the CPU.
#ifdef DLIB_USE_CUDA
#include "gpu_data.h"
#include <iostream>
#include "cuda_utils.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
void gpu_data::
wait_for_transfer_to_finish() const
{
if (have_active_transfer)
{
std::cout << "wait for cudaStreamSynchronize()" << std::endl;
CHECK_CUDA(cudaStreamSynchronize((cudaStream_t)cuda_stream.get()));
have_active_transfer = false;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_device() const
{
wait_for_transfer_to_finish();
if (!device_current)
{
std::cout << "cudaMemcpy to device" << std::endl;
CHECK_CUDA(cudaMemcpy(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice));
device_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
copy_to_host() const
{
wait_for_transfer_to_finish();
if (!host_current)
{
std::cout << "cudaMemcpy to host" << std::endl;
CHECK_CUDA(cudaMemcpy(data_host.get(), data_device.get(), data_size*sizeof(float), cudaMemcpyDeviceToHost));
host_current = true;
// Check for errors. These calls to cudaGetLastError() are what help us find
// out if our kernel launches have been failing.
CHECK_CUDA(cudaGetLastError());
}
}
void gpu_data::
async_copy_to_device()
{
if (!device_current)
{
std::cout << "cudaMemcpyAsync to device" << std::endl;
CHECK_CUDA(cudaMemcpyAsync(data_device.get(), data_host.get(), data_size*sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)cuda_stream.get()));
have_active_transfer = true;
device_current = true;
}
}
void gpu_data::
set_size(
size_t new_size
)
{
wait_for_transfer_to_finish();
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
try
{
void* data;
CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
// Note that we don't throw exceptions since the free calls are invariably
// called in destructors. They also shouldn't fail anyway unless someone
// is resetting the GPU card in the middle of their program.
data_host.reset((float*)data, [](float* ptr){
auto err = cudaFreeHost(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
data_device.reset((float*)data, [](float* ptr){
auto err = cudaFree(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
if (!cuda_stream)
{
cudaStream_t cstream;
CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
cuda_stream.reset(cstream, [](void* ptr){
auto err = cudaStreamDestroy((cudaStream_t)ptr);
if(err!=cudaSuccess)
std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
}
}
catch(...)
{
set_size(0);
throw;
}
}
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_USE_CUDA
#endif // DLIB_GPU_DaTA_CPP_
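Taken together, the functions above implement a simple coherence protocol: async_copy_to_device() queues a cudaMemcpyAsync on the object's private stream and immediately marks the device copy current, while every synchronous accessor first calls wait_for_transfer_to_finish(), so data is never read while a transfer is still in flight. A usage sketch (illustrative only):

    #include <algorithm>
    #include "gpu_data.h"

    void fill_and_upload(dlib::gpu_data& g)
    {
        g.set_size(1024);                 // pinned host buffer, device buffer, and a CUDA stream
        float* h = g.host();              // host pointer; marks the device copy stale
        std::fill(h, h + g.size(), 1.0f);
        g.async_copy_to_device();         // queues the upload and marks the device copy current
        const float* d = g.device();      // waits for the queued transfer, then returns device memory
        (void)d;                          // d would normally be handed to a kernel here
    }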

dlib/dnn/gpu_data.h (new file, 168 lines)

@@ -0,0 +1,168 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_H_
#define DLIB_GPU_DaTA_H_
#include <memory>
#include "cuda_errors.h"
#include "../serialize.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class gpu_data
{
/*!
CONVENTION
- if (size() != 0) then
- data_host == a pointer to size() floats in CPU memory.
- if (data_device) then
- data_device == a pointer to size() floats in device memory.
- if (there might be an active transfer between host and device) then
- have_active_transfer == true
- We use the host_current and device_current bools to keep track of which
copy of the data (or both) are most current. e.g. if the CPU has
modified the tensor and it hasn't been copied to the device yet then
host_current==true and device_current == false.
THREAD SAFETY
This object is not thread-safe. Don't touch it from multiple threads at the
same time.
!*/
public:
gpu_data(
) : data_size(0), host_current(true), device_current(true),have_active_transfer(false)
{
}
// Not copyable
gpu_data(const gpu_data&) = delete;
gpu_data& operator=(const gpu_data&) = delete;
// but is movable
gpu_data(gpu_data&&) = default;
gpu_data& operator=(gpu_data&&) = default;
#ifdef DLIB_USE_CUDA
void async_copy_to_device();
void set_size(size_t new_size);
#else
// Note that calls to host() or device() will block until any async transfers are complete.
void async_copy_to_device(){}
void set_size(size_t new_size)
{
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
data_host.reset(new float[new_size], std::default_delete<float[]>());
data_device.reset();
}
}
#endif
const float* host() const
{
copy_to_host();
return data_host.get();
}
float* host()
{
copy_to_host();
device_current = false;
return data_host.get();
}
const float* device() const
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
return data_device.get();
}
float* device()
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
host_current = false;
return data_device.get();
}
size_t size() const { return data_size; }
private:
#ifdef DLIB_USE_CUDA
void copy_to_device() const;
void copy_to_host() const;
void wait_for_transfer_to_finish() const;
#else
void copy_to_device() const{}
void copy_to_host() const{}
void wait_for_transfer_to_finish() const{}
#endif
size_t data_size;
mutable bool host_current;
mutable bool device_current;
mutable bool have_active_transfer;
std::shared_ptr<float> data_host;
std::shared_ptr<float> data_device;
std::shared_ptr<void> cuda_stream;
};
inline void serialize(const gpu_data& item, std::ostream& out)
{
int version = 1;
serialize(version, out);
serialize(item.size(), out);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
serialize(data[i], out);
}
inline void deserialize(gpu_data& item, std::istream& in)
{
int version;
deserialize(version, in);
if (version != 1)
throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");
size_t s;
deserialize(s, in);
item.set_size(s);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
deserialize(data[i], in);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_GPU_DaTA_H_
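The serialize()/deserialize() helpers above stream a version number, the size, and then each float through the host pointer, so they behave the same way in CUDA and CPU-only builds. A small round-trip sketch (illustrative only):

    #include <sstream>
    #include "gpu_data.h"

    void roundtrip()
    {
        dlib::gpu_data a, b;
        a.set_size(3);
        a.host()[0] = 1; a.host()[1] = 2; a.host()[2] = 3;

        std::ostringstream sout;
        serialize(a, sout);                // found by ADL in namespace dlib

        std::istringstream sin(sout.str());
        deserialize(b, sin);               // b ends up with size 3 and the same contents as a
    }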

dlib/dnn/tensor.h

@@ -3,165 +3,13 @@
#ifndef DLIB_DNn_TENSOR_H_
#define DLIB_DNn_TENSOR_H_
#include <memory>
#include <cstring>
#include "../matrix.h"
#include "cudnn_dlibapi.h"
#include "gpu_data.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class gpu_data
{
/*!
CONVENTION
- if (size() != 0) then
- data_host == a pointer to size() floats in CPU memory.
- if (data_device) then
- data_device == a pointer to size() floats in device memory.
- if (there might be an active transfer between host and device) then
- have_active_transfer == true
- We use the host_current and device_current bools to keep track of which
copy of the data (or both) are most current. e.g. if the CPU has
modified the tensor and it hasn't been copied to the device yet then
host_current==true and device_current == false.
THREAD SAFETY
This object is not thread-safe. Don't touch it from multiple threads at the
same time.
!*/
public:
gpu_data(
) : data_size(0), host_current(true), device_current(true),have_active_transfer(false)
{
}
// Not copyable
gpu_data(const gpu_data&) = delete;
gpu_data& operator=(const gpu_data&) = delete;
// but is movable
gpu_data(gpu_data&&) = default;
gpu_data& operator=(gpu_data&&) = default;
#ifdef DLIB_USE_CUDA
void async_copy_to_device();
void set_size(size_t new_size);
#else
// Note that calls to host() or device() will block until any async transfers are complete.
void async_copy_to_device(){}
void set_size(size_t new_size)
{
if (new_size == 0)
{
data_size = 0;
host_current = true;
device_current = true;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
data_size = new_size;
host_current = true;
device_current = true;
data_host.reset(new float[new_size], std::default_delete<float[]>());
data_device.reset();
}
}
#endif
const float* host() const
{
copy_to_host();
return data_host.get();
}
float* host()
{
copy_to_host();
device_current = false;
return data_host.get();
}
const float* device() const
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
return data_device.get();
}
float* device()
{
#ifndef DLIB_USE_CUDA
DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
copy_to_device();
host_current = false;
return data_device.get();
}
size_t size() const { return data_size; }
private:
#ifdef DLIB_USE_CUDA
void copy_to_device() const;
void copy_to_host() const;
void wait_for_transfer_to_finish() const;
#else
void copy_to_device() const{}
void copy_to_host() const{}
void wait_for_transfer_to_finish() const{}
#endif
size_t data_size;
mutable bool host_current;
mutable bool device_current;
mutable bool have_active_transfer;
std::shared_ptr<float> data_host;
std::shared_ptr<float> data_device;
std::shared_ptr<void> cuda_stream;
};
inline void serialize(const gpu_data& item, std::ostream& out)
{
int version = 1;
serialize(version, out);
serialize(item.size(), out);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
serialize(data[i], out);
}
inline void deserialize(gpu_data& item, std::istream& in)
{
int version;
deserialize(version, in);
if (version != 1)
throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");
size_t s;
deserialize(s, in);
item.set_size(s);
auto data = item.host();
for (size_t i = 0; i < item.size(); ++i)
deserialize(data[i], in);
}
// ----------------------------------------------------------------------------------------
class tensor