diff --git a/dlib/dnn/gpu_data.cpp b/dlib/dnn/gpu_data.cpp
index 2f0b105db..089f8480b 100644
--- a/dlib/dnn/gpu_data.cpp
+++ b/dlib/dnn/gpu_data.cpp
@@ -136,6 +136,8 @@ namespace dlib
             try
             {
+                CHECK_CUDA(cudaGetDevice(&the_device_id));
+
                 void* data;
                 CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
                 // Note that we don't throw exceptions since the free calls are invariably
                 // called in destructors.  They also shouldn't fail anyway unless someone
diff --git a/dlib/dnn/gpu_data.h b/dlib/dnn/gpu_data.h
index 398fb59b0..15db5ab3d 100644
--- a/dlib/dnn/gpu_data.h
+++ b/dlib/dnn/gpu_data.h
@@ -40,7 +40,7 @@ namespace dlib
     public:
 
         gpu_data(
-        ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false)
+        ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false), the_device_id(0)
         {
         }
 
@@ -52,6 +52,7 @@ namespace dlib
         gpu_data(gpu_data&& item) : gpu_data() { swap(item); }
         gpu_data& operator=(gpu_data&& item) { swap(item); return *this; }
 
+        int device_id() const { return the_device_id; }
 
 #ifdef DLIB_USE_CUDA
         void async_copy_to_device() const;
@@ -153,6 +154,7 @@ namespace dlib
             std::swap(data_host, item.data_host);
             std::swap(data_device, item.data_device);
             std::swap(cuda_stream, item.cuda_stream);
+            std::swap(the_device_id, item.the_device_id);
         }
 
     private:
@@ -177,6 +179,7 @@ namespace dlib
         std::shared_ptr<float> data_host;
         std::shared_ptr<float> data_device;
         std::shared_ptr<void> cuda_stream;
+        int the_device_id;
     };
 
     inline void serialize(const gpu_data& item, std::ostream& out)
diff --git a/dlib/dnn/gpu_data_abstract.h b/dlib/dnn/gpu_data_abstract.h
index 349eb6d1c..09b274094 100644
--- a/dlib/dnn/gpu_data_abstract.h
+++ b/dlib/dnn/gpu_data_abstract.h
@@ -45,6 +45,7 @@ namespace dlib
                 - #device() == nullptr
                 - #host_ready() == true
                 - #device_ready() == true
+                - #device_id() == 0
         !*/
 
         // This object is not copyable, however, it is movable.
@@ -53,6 +54,14 @@ namespace dlib
         gpu_data(gpu_data&& item);
         gpu_data& operator=(gpu_data&& item);
 
+        int device_id(
+        ) const;
+        /*!
+            ensures
+                - returns the ID of the CUDA device that allocated this memory.  I.e. the
+                  number returned by cudaGetDevice() when the memory was allocated.
+                - If CUDA is not being used then this function always returns 0.
+        !*/
 
         void async_copy_to_device(
         );
diff --git a/dlib/dnn/tensor.h b/dlib/dnn/tensor.h
index b077ef45f..13e3deec9 100644
--- a/dlib/dnn/tensor.h
+++ b/dlib/dnn/tensor.h
@@ -52,6 +52,8 @@ namespace dlib
         virtual float* device() = 0;
         virtual float* device_write_only() = 0;
 
+        int device_id() const { return data().device_id(); }
+
         tensor& operator= (float val)
         {
 #ifdef DLIB_USE_CUDA
diff --git a/dlib/dnn/tensor_abstract.h b/dlib/dnn/tensor_abstract.h
index 56a32dc9d..8d59c322f 100644
--- a/dlib/dnn/tensor_abstract.h
+++ b/dlib/dnn/tensor_abstract.h
@@ -187,6 +187,15 @@ namespace dlib
               every memory location in the returned memory block.
         !*/
 
+        int device_id(
+        ) const;
+        /*!
+            ensures
+                - returns the ID of the CUDA device that allocated this memory.  I.e. the
+                  number returned by cudaGetDevice() when the memory was allocated.
+                - If CUDA is not being used then this function always returns 0.
+        !*/
+
         tensor& operator= (
            float val
        );
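
The patch above records, at allocation time, which CUDA device owns each gpu_data buffer and exposes it through gpu_data::device_id() and tensor::device_id().  Below is a minimal sketch of how the new accessor might be used to guard a multi-GPU code path.  The helper require_same_device() is hypothetical and not part of this patch, and the sketch assumes dlib is built with DLIB_USE_CUDA; without CUDA, device_id() always returns 0, so the check passes trivially.

    #include <dlib/dnn.h>
    #include <sstream>
    #include <stdexcept>

    // Hypothetical helper (not part of this patch): refuse to mix tensors
    // whose memory was allocated on different CUDA devices.
    void require_same_device(const dlib::tensor& a, const dlib::tensor& b)
    {
        if (a.device_id() != b.device_id())
        {
            std::ostringstream err;
            err << "tensors allocated on different CUDA devices: "
                << a.device_id() << " vs. " << b.device_id();
            throw std::invalid_argument(err.str());
        }
    }

    int main()
    {
        dlib::resizable_tensor a, b;
        a.set_size(2, 3, 4, 4);
        b.set_size(2, 3, 4, 4);

        // Both tensors were allocated while the same device was current, so
        // their device IDs match and this check passes.
        require_same_device(a, b);
    }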