From 0cb81e5f50ab396726a334784db308025d74c2b7 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sun, 7 Jan 2018 20:44:04 +0300 Subject: [PATCH] Some stream fixes --- src/cuda.h | 1 + src/network_kernels.cu | 4 +++- src/yolo_console_dll.cpp | 3 +-- src/yolo_v2_class.cpp | 8 +++++--- src/yolo_v2_class.hpp | 25 ++++++++++++++----------- 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/cuda.h b/src/cuda.h index 31f9092a..31577ff3 100644 --- a/src/cuda.h +++ b/src/cuda.h @@ -26,6 +26,7 @@ int *cuda_make_int_array(size_t n); void cuda_push_array(float *x_gpu, float *x, size_t n); void cuda_pull_array(float *x_gpu, float *x, size_t n); void cuda_set_device(int n); +int cuda_get_device(); void cuda_free(float *x_gpu); void cuda_random(float *x_gpu, size_t n); float cuda_compare(float *x_gpu, float *x, size_t n, char *s); diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 7a261c57..341c7f8f 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -51,6 +51,7 @@ void forward_network_gpu(network net, network_state state) fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1); } l.forward_gpu(l, state); + cudaStreamSynchronize(get_cuda_stream()); state.input = l.output_gpu; } } @@ -392,7 +393,8 @@ float *get_network_output_gpu(network net) float *network_predict_gpu(network net, float *input) { - cuda_set_device(net.gpu_index); + if (net.gpu_index != cuda_get_device()) + cuda_set_device(net.gpu_index); int size = get_network_input_size(net) * net.batch; network_state state; state.index = 0; diff --git a/src/yolo_console_dll.cpp b/src/yolo_console_dll.cpp index 0b475548..abf4bdc8 100644 --- a/src/yolo_console_dll.cpp +++ b/src/yolo_console_dll.cpp @@ -158,7 +158,6 @@ int main(int argc, char *argv[]) det_image = detector.mat_to_image_resize(cur_frame); result_vec = thread_result_vec; result_vec = detector.tracking(result_vec); // comment it - if track_id is not required - #ifdef TRACK_OPTFLOW // track optical flow if (track_optflow_queue.size() > 0) { @@ -189,7 +188,7 @@ int main(int argc, char *argv[]) //std::vector result; auto result = detector.detect_resized(*current_image, frame_size, 0.24, false); // true //Sleep(200); - Sleep(50); + //Sleep(50); ++fps_det_counter; std::unique_lock lock(mtx); thread_result_vec = result; diff --git a/src/yolo_v2_class.cpp b/src/yolo_v2_class.cpp index 1777aa22..88d2bc12 100644 --- a/src/yolo_v2_class.cpp +++ b/src/yolo_v2_class.cpp @@ -34,7 +34,7 @@ struct detector_gpu_t{ }; -YOLODLL_API Detector::Detector(std::string cfg_filename, std::string weight_filename, int gpu_id) +YOLODLL_API Detector::Detector(std::string cfg_filename, std::string weight_filename, int gpu_id) : cur_gpu_id(gpu_id) { int old_gpu_index; #ifdef GPU @@ -178,7 +178,8 @@ YOLODLL_API std::vector Detector::detect(image_t img, float thresh, bool int old_gpu_index; #ifdef GPU cudaGetDevice(&old_gpu_index); - cudaSetDevice(net.gpu_index); + if(cur_gpu_id != old_gpu_index) + cudaSetDevice(net.gpu_index); #endif //std::cout << "net.gpu_index = " << net.gpu_index << std::endl; @@ -242,7 +243,8 @@ YOLODLL_API std::vector Detector::detect(image_t img, float thresh, bool free(sized.data); #ifdef GPU - cudaSetDevice(old_gpu_index); + if (cur_gpu_id != old_gpu_index) + cudaSetDevice(old_gpu_index); #endif return bbox_vec; diff --git a/src/yolo_v2_class.hpp b/src/yolo_v2_class.hpp index edb24cbf..392b71ae 100644 --- a/src/yolo_v2_class.hpp +++ b/src/yolo_v2_class.hpp @@ -47,6 +47,7 @@ struct image_t { class Detector { std::shared_ptr detector_gpu_ptr; std::deque> prev_bbox_vec_deque; + const int cur_gpu_id; public: float nms = .4; @@ -170,8 +171,8 @@ public: sync_PyrLKOpticalFlow_gpu = cv::cuda::SparsePyrLKOpticalFlow::create(); sync_PyrLKOpticalFlow_gpu->setWinSize(cv::Size(21, 21)); // 15, 21, 31 - sync_PyrLKOpticalFlow_gpu->setMaxLevel(5); // +- 50 ptx - sync_PyrLKOpticalFlow_gpu->setNumIters(2000); // def: 30 + sync_PyrLKOpticalFlow_gpu->setMaxLevel(3); // +- 5 ptx + sync_PyrLKOpticalFlow_gpu->setNumIters(1000); // def: 30 cv::cuda::setDevice(old_gpu_id); } @@ -190,9 +191,8 @@ public: void update_tracking_flow(cv::Mat src_mat) { int const old_gpu_id = cv::cuda::getDevice(); - cv::cuda::setDevice(gpu_id); - - //cv::cuda::Stream stream; + if (old_gpu_id != gpu_id) + cv::cuda::setDevice(gpu_id); if (src_mat.channels() == 3) { if (src_mat_gpu.cols == 0) { @@ -203,7 +203,8 @@ public: src_mat_gpu.upload(src_mat, stream); cv::cuda::cvtColor(src_mat_gpu, src_grey_gpu, CV_BGR2GRAY, 0, stream); } - cv::cuda::setDevice(old_gpu_id); + if (old_gpu_id != gpu_id) + cv::cuda::setDevice(old_gpu_id); } @@ -215,9 +216,8 @@ public: } int const old_gpu_id = cv::cuda::getDevice(); - cv::cuda::setDevice(gpu_id); - - //cv::cuda::Stream stream; + if(old_gpu_id != gpu_id) + cv::cuda::setDevice(gpu_id); if (dst_mat_gpu.cols == 0) { dst_mat_gpu = cv::cuda::GpuMat(dst_mat.size(), dst_mat.type()); @@ -225,9 +225,9 @@ public: tmp_grey_gpu = cv::cuda::GpuMat(dst_mat.size(), CV_8UC1); } - dst_mat_gpu.upload(dst_mat, stream); + cv::cuda::cvtColor(dst_mat_gpu, dst_grey_gpu, CV_BGR2GRAY, 0, stream); if (src_grey_gpu.rows != dst_grey_gpu.rows || src_grey_gpu.cols != dst_grey_gpu.cols) { @@ -237,6 +237,8 @@ public: return cur_bbox_vec; } + //return cur_bbox_vec; + cv::Mat prev_pts, prev_pts_flow_cpu, cur_pts_flow_cpu; for (auto &i : cur_bbox_vec) { @@ -298,7 +300,8 @@ public: } } - cv::cuda::setDevice(old_gpu_id); + if (old_gpu_id != gpu_id) + cv::cuda::setDevice(old_gpu_id); return result_bbox_vec; }