From c87e33eb0662bd1b6883fd37b40e64f5a9e389ac Mon Sep 17 00:00:00 2001
From: Cristi Fati <fati_utcluj@yahoo.com>
Date: Sat, 26 Aug 2023 02:56:16 +0300
Subject: [PATCH] Fix thread synchronization bug + many more (#8562)

* Minor: float fps (useful for small values - e.g. CPU)
* Fix multiple code readability issues
* Better synchronize threads (when user interrupts video)
* Better thread objects handling
* Pass data via arguments instead of global like variables - 0
* Minor code readability fixes
* CTypes definitions
* Pass data via arguments instead of global like variables - 1
* Code reordering + (minor) renames
* Pass data via arguments instead of global like variables - 2
---
 darknet.py       | 185 +++++++++++++++++++++++++++--------------------
 darknet_video.py | 164 ++++++++++++++++++++++++-----------------
 2 files changed, 203 insertions(+), 146 deletions(-)

diff --git a/darknet.py b/darknet.py
index ebb0eede..603ecae9 100644
--- a/darknet.py
+++ b/darknet.py
@@ -8,49 +8,71 @@ Directly viewing or returning bounding-boxed images requires scikit-image to be
 Use pip3 instead of pip on some systems to be sure to install modules for python3
 """
 
-from ctypes import *
-import math
+import ctypes as ct
 import random
 import os
+import cv2
+import numpy as np
 
 
-class BOX(Structure):
-    _fields_ = [("x", c_float),
-                ("y", c_float),
-                ("w", c_float),
-                ("h", c_float)]
+class BOX(ct.Structure):
+    _fields_ = (
+        ("x", ct.c_float),
+        ("y", ct.c_float),
+        ("w", ct.c_float),
+        ("h", ct.c_float),
+    )
 
 
-class DETECTION(Structure):
-    _fields_ = [("bbox", BOX),
-                ("classes", c_int),
-                ("best_class_idx", c_int),
-                ("prob", POINTER(c_float)),
-                ("mask", POINTER(c_float)),
-                ("objectness", c_float),
-                ("sort_class", c_int),
-                ("uc", POINTER(c_float)),
-                ("points", c_int),
-                ("embeddings", POINTER(c_float)),
-                ("embedding_size", c_int),
-                ("sim", c_float),
-                ("track_id", c_int)]
-
-class DETNUMPAIR(Structure):
-    _fields_ = [("num", c_int),
-                ("dets", POINTER(DETECTION))]
+FloatPtr = ct.POINTER(ct.c_float)
+IntPtr = ct.POINTER(ct.c_int)
 
 
-class IMAGE(Structure):
-    _fields_ = [("w", c_int),
-                ("h", c_int),
-                ("c", c_int),
-                ("data", POINTER(c_float))]
+class DETECTION(ct.Structure):
+    _fields_ = (
+        ("bbox", BOX),
+        ("classes", ct.c_int),
+        ("best_class_idx", ct.c_int),
+        ("prob", FloatPtr),
+        ("mask", FloatPtr),
+        ("objectness", ct.c_float),
+        ("sort_class", ct.c_int),
+        ("uc", FloatPtr),
+        ("points", ct.c_int),
+        ("embeddings", FloatPtr),
+        ("embedding_size", ct.c_int),
+        ("sim", ct.c_float),
+        ("track_id", ct.c_int),
+    )
 
 
-class METADATA(Structure):
-    _fields_ = [("classes", c_int),
-                ("names", POINTER(c_char_p))]
+DETECTIONPtr = ct.POINTER(DETECTION)
+
+
+class DETNUMPAIR(ct.Structure):
+    _fields_ = (
+        ("num", ct.c_int),
+        ("dets", DETECTIONPtr),
+    )
+
+
+DETNUMPAIRPtr = ct.POINTER(DETNUMPAIR)
+
+
+class IMAGE(ct.Structure):
+    _fields_ = (
+        ("w", ct.c_int),
+        ("h", ct.c_int),
+        ("c", ct.c_int),
+        ("data", FloatPtr),
+    )
+
+
+class METADATA(ct.Structure):
+    _fields_ = (
+        ("classes", ct.c_int),
+        ("names", ct.POINTER(ct.c_char_p)),
+    )
 
 
 def network_width(net):
@@ -67,10 +89,10 @@ def bbox2points(bbox):
     to corner points cv2 rectangle
     """
     x, y, w, h = bbox
-    xmin = int(round(x - (w / 2)))
-    xmax = int(round(x + (w / 2)))
-    ymin = int(round(y - (h / 2)))
-    ymax = int(round(y + (h / 2)))
+    xmin = round(x - (w / 2))
+    xmax = round(x + (w / 2))
+    ymin = round(y - (h / 2))
+    ymax = round(y + (h / 2))
     return xmin, ymin, xmax, ymax
 
 
@@ -134,6 +156,7 @@ def decode_detection(detections):
         decoded.append((str(label), confidence, bbox))
     return decoded
 
+
 # https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/
 # Malisiewicz et al.
 def non_max_suppression_fast(detections, overlap_thresh):
@@ -185,6 +208,7 @@ def non_max_suppression_fast(detections, overlap_thresh):
         # integer data type
     return [detections[i] for i in pick]
 
+
 def remove_negatives(detections, class_names, num):
     """
     Remove all classes with 0% confidence within the detection
@@ -218,7 +242,7 @@ def detect_image(network, class_names, image, thresh=.5, hier_thresh=.5, nms=.45
     """
         Returns a list with highest confidence class and their bbox
     """
-    pnum = pointer(c_int(0))
+    pnum = ct.pointer(ct.c_int(0))
     predict_image(network, image)
     detections = get_network_boxes(network, image.w, image.h,
                                    thresh, hier_thresh, None, 0, pnum, 0)
@@ -233,102 +257,105 @@ def detect_image(network, class_names, image, thresh=.5, hier_thresh=.5, nms=.45
 
 if os.name == "posix":
     cwd = os.path.dirname(__file__)
-    lib = CDLL(cwd + "/libdarknet.so", RTLD_GLOBAL)
+    lib = ct.CDLL(cwd + "/libdarknet.so", ct.RTLD_GLOBAL)
 elif os.name == "nt":
     cwd = os.path.dirname(__file__)
-    os.environ['PATH'] = cwd + ';' + os.environ['PATH']
-    lib = CDLL("darknet.dll", RTLD_GLOBAL)
+    os.environ["PATH"] = os.path.pathsep.join((cwd, os.environ["PATH"]))
+    lib = ct.CDLL("darknet.dll", ct.RTLD_GLOBAL)
 else:
+    lib = None  # Intellisense
     print("Unsupported OS")
-    exit
+    exit()
 
-lib.network_width.argtypes = [c_void_p]
-lib.network_width.restype = c_int
-lib.network_height.argtypes = [c_void_p]
-lib.network_height.restype = c_int
+lib.network_width.argtypes = (ct.c_void_p,)
+lib.network_width.restype = ct.c_int
+lib.network_height.argtypes = (ct.c_void_p,)
+lib.network_height.restype = ct.c_int
 
 copy_image_from_bytes = lib.copy_image_from_bytes
-copy_image_from_bytes.argtypes = [IMAGE,c_char_p]
+copy_image_from_bytes.argtypes = (IMAGE, ct.c_char_p)
 
 predict = lib.network_predict_ptr
-predict.argtypes = [c_void_p, POINTER(c_float)]
-predict.restype = POINTER(c_float)
+predict.argtypes = (ct.c_void_p, FloatPtr)
+predict.restype = FloatPtr
 
 set_gpu = lib.cuda_set_device
 init_cpu = lib.init_cpu
 
 make_image = lib.make_image
-make_image.argtypes = [c_int, c_int, c_int]
+make_image.argtypes = (ct.c_int, ct.c_int, ct.c_int)
 make_image.restype = IMAGE
 
 get_network_boxes = lib.get_network_boxes
-get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int), c_int]
-get_network_boxes.restype = POINTER(DETECTION)
+get_network_boxes.argtypes = (ct.c_void_p, ct.c_int, ct.c_int, ct.c_float, ct.c_float, IntPtr, ct.c_int, IntPtr,
+                              ct.c_int)
+get_network_boxes.restype = DETECTIONPtr
 
 make_network_boxes = lib.make_network_boxes
-make_network_boxes.argtypes = [c_void_p]
-make_network_boxes.restype = POINTER(DETECTION)
+make_network_boxes.argtypes = (ct.c_void_p,)
+make_network_boxes.restype = DETECTIONPtr
 
 free_detections = lib.free_detections
-free_detections.argtypes = [POINTER(DETECTION), c_int]
+free_detections.argtypes = (DETECTIONPtr, ct.c_int)
 
 free_batch_detections = lib.free_batch_detections
-free_batch_detections.argtypes = [POINTER(DETNUMPAIR), c_int]
+free_batch_detections.argtypes = (DETNUMPAIRPtr, ct.c_int)
 
 free_ptrs = lib.free_ptrs
-free_ptrs.argtypes = [POINTER(c_void_p), c_int]
+free_ptrs.argtypes = (ct.POINTER(ct.c_void_p), ct.c_int)
 
 network_predict = lib.network_predict_ptr
-network_predict.argtypes = [c_void_p, POINTER(c_float)]
+network_predict.argtypes = (ct.c_void_p, FloatPtr)
 
 reset_rnn = lib.reset_rnn
-reset_rnn.argtypes = [c_void_p]
+reset_rnn.argtypes = (ct.c_void_p,)
 
 load_net = lib.load_network
-load_net.argtypes = [c_char_p, c_char_p, c_int]
-load_net.restype = c_void_p
+load_net.argtypes = (ct.c_char_p, ct.c_char_p, ct.c_int)
+load_net.restype = ct.c_void_p
 
 load_net_custom = lib.load_network_custom
-load_net_custom.argtypes = [c_char_p, c_char_p, c_int, c_int]
-load_net_custom.restype = c_void_p
+load_net_custom.argtypes = (ct.c_char_p, ct.c_char_p, ct.c_int, ct.c_int)
+load_net_custom.restype = ct.c_void_p
 
 free_network_ptr = lib.free_network_ptr
-free_network_ptr.argtypes = [c_void_p]
-free_network_ptr.restype = c_void_p
+free_network_ptr.argtypes = (ct.c_void_p,)
+free_network_ptr.restype = ct.c_void_p
 
 do_nms_obj = lib.do_nms_obj
-do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
+do_nms_obj.argtypes = (DETECTIONPtr, ct.c_int, ct.c_int, ct.c_float)
 
 do_nms_sort = lib.do_nms_sort
-do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
+do_nms_sort.argtypes = (DETECTIONPtr, ct.c_int, ct.c_int, ct.c_float)
 
 free_image = lib.free_image
-free_image.argtypes = [IMAGE]
+free_image.argtypes = (IMAGE,)
 
 letterbox_image = lib.letterbox_image
-letterbox_image.argtypes = [IMAGE, c_int, c_int]
+letterbox_image.argtypes = (IMAGE, ct.c_int, ct.c_int)
 letterbox_image.restype = IMAGE
 
 load_meta = lib.get_metadata
-lib.get_metadata.argtypes = [c_char_p]
+lib.get_metadata.argtypes = (ct.c_char_p,)
 lib.get_metadata.restype = METADATA
 
 load_image = lib.load_image_color
-load_image.argtypes = [c_char_p, c_int, c_int]
+load_image.argtypes = (ct.c_char_p, ct.c_int, ct.c_int)
 load_image.restype = IMAGE
 
 rgbgr_image = lib.rgbgr_image
-rgbgr_image.argtypes = [IMAGE]
+rgbgr_image.argtypes = (IMAGE,)
 
 predict_image = lib.network_predict_image
-predict_image.argtypes = [c_void_p, IMAGE]
-predict_image.restype = POINTER(c_float)
+predict_image.argtypes = (ct.c_void_p, IMAGE)
+predict_image.restype = FloatPtr
 
 predict_image_letterbox = lib.network_predict_image_letterbox
-predict_image_letterbox.argtypes = [c_void_p, IMAGE]
-predict_image_letterbox.restype = POINTER(c_float)
+predict_image_letterbox.argtypes = (ct.c_void_p, IMAGE)
+predict_image_letterbox.restype = FloatPtr
 
 network_predict_batch = lib.network_predict_batch
-network_predict_batch.argtypes = [c_void_p, IMAGE, c_int, c_int, c_int,
-                                   c_float, c_float, POINTER(c_int), c_int, c_int]
-network_predict_batch.restype = POINTER(DETNUMPAIR)
+network_predict_batch.argtypes = (ct.c_void_p, IMAGE, ct.c_int, ct.c_int, ct.c_int,
+                                  ct.c_float, ct.c_float, IntPtr, ct.c_int, ct.c_int)
+network_predict_batch.restype = DETNUMPAIRPtr
+
diff --git a/darknet_video.py b/darknet_video.py
index eab6e9e7..c6682a24 100644
--- a/darknet_video.py
+++ b/darknet_video.py
@@ -1,12 +1,11 @@
-from ctypes import *
 import random
 import os
 import cv2
 import time
 import darknet
 import argparse
-from threading import Thread, enumerate
-from queue import Queue
+import threading
+import queue
 
 
 def parser():
@@ -17,9 +16,9 @@ def parser():
                         help="inference video name. Not saved if empty")
     parser.add_argument("--weights", default="yolov4.weights",
                         help="yolo weights path")
-    parser.add_argument("--dont_show", action='store_true',
-                        help="windown inference display. For headless systems")
-    parser.add_argument("--ext_output", action='store_true',
+    parser.add_argument("--dont_show", action="store_true",
+                        help="window inference display. For headless systems")
+    parser.add_argument("--ext_output", action="store_true",
                         help="display bbox coordinates of detected objects")
     parser.add_argument("--config_file", default="./cfg/yolov4.cfg",
                         help="path to config file")
@@ -53,131 +52,162 @@ def check_arguments_errors(args):
         raise(ValueError("Invalid video path {}".format(os.path.abspath(args.input))))
 
 
-def set_saved_video(input_video, output_video, size):
+def set_saved_video(output_video, size, fps):
     fourcc = cv2.VideoWriter_fourcc(*"MJPG")
-    fps = int(input_video.get(cv2.CAP_PROP_FPS))
-    video = cv2.VideoWriter(output_video, fourcc, fps, size)
-    return video
+    return cv2.VideoWriter(output_video, fourcc, fps, size)
 
 
-def convert2relative(bbox):
+def convert2relative(bbox, preproc_h, preproc_w):
     """
     YOLO format use relative coordinates for annotation
     """
-    x, y, w, h  = bbox
-    _height     = darknet_height
-    _width      = darknet_width
-    return x/_width, y/_height, w/_width, h/_height
+    x, y, w, h = bbox
+    return x / preproc_w, y / preproc_h, w / preproc_w, h / preproc_h
 
 
-def convert2original(image, bbox):
-    x, y, w, h = convert2relative(bbox)
+def convert2original(image, bbox, preproc_h, preproc_w):
+    x, y, w, h = convert2relative(bbox, preproc_h, preproc_w)
 
     image_h, image_w, __ = image.shape
 
-    orig_x       = int(x * image_w)
-    orig_y       = int(y * image_h)
-    orig_width   = int(w * image_w)
-    orig_height  = int(h * image_h)
+    orig_x = int(x * image_w)
+    orig_y = int(y * image_h)
+    orig_width = int(w * image_w)
+    orig_height = int(h * image_h)
 
     bbox_converted = (orig_x, orig_y, orig_width, orig_height)
 
     return bbox_converted
 
 
-def convert4cropping(image, bbox):
-    x, y, w, h = convert2relative(bbox)
+# @TODO - cfati: Unused
+def convert4cropping(image, bbox, preproc_h, preproc_w):
+    x, y, w, h = convert2relative(bbox, preproc_h, preproc_w)
 
     image_h, image_w, __ = image.shape
 
-    orig_left    = int((x - w / 2.) * image_w)
-    orig_right   = int((x + w / 2.) * image_w)
-    orig_top     = int((y - h / 2.) * image_h)
-    orig_bottom  = int((y + h / 2.) * image_h)
+    orig_left = int((x - w / 2.) * image_w)
+    orig_right = int((x + w / 2.) * image_w)
+    orig_top = int((y - h / 2.) * image_h)
+    orig_bottom = int((y + h / 2.) * image_h)
 
-    if (orig_left < 0): orig_left = 0
-    if (orig_right > image_w - 1): orig_right = image_w - 1
-    if (orig_top < 0): orig_top = 0
-    if (orig_bottom > image_h - 1): orig_bottom = image_h - 1
+    if orig_left < 0:
+        orig_left = 0
+    if orig_right > image_w - 1:
+        orig_right = image_w - 1
+    if orig_top < 0:
+        orig_top = 0
+    if orig_bottom > image_h - 1:
+        orig_bottom = image_h - 1
 
     bbox_cropping = (orig_left, orig_top, orig_right, orig_bottom)
 
     return bbox_cropping
 
 
-def video_capture(frame_queue, darknet_image_queue):
-    while cap.isOpened():
+def video_capture(stop_flag, input_path, raw_frame_queue, preprocessed_frame_queue, preproc_h, preproc_w):
+    cap = cv2.VideoCapture(input_path)
+    while cap.isOpened() and not stop_flag.is_set():
         ret, frame = cap.read()
         if not ret:
             break
         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        frame_resized = cv2.resize(frame_rgb, (darknet_width, darknet_height),
+        frame_resized = cv2.resize(frame_rgb, (preproc_w, preproc_h),
                                    interpolation=cv2.INTER_LINEAR)
-        frame_queue.put(frame)
-        img_for_detect = darknet.make_image(darknet_width, darknet_height, 3)
+        raw_frame_queue.put(frame)
+        img_for_detect = darknet.make_image(preproc_w, preproc_h, 3)
         darknet.copy_image_from_bytes(img_for_detect, frame_resized.tobytes())
-        darknet_image_queue.put(img_for_detect)
+        preprocessed_frame_queue.put(img_for_detect)
+    stop_flag.set()
     cap.release()
 
 
-def inference(darknet_image_queue, detections_queue, fps_queue):
-    while cap.isOpened():
-        darknet_image = darknet_image_queue.get()
+def inference(stop_flag, preprocessed_frame_queue, detections_queue, fps_queue,
+              network, class_names, threshold):
+    while not stop_flag.is_set():
+        darknet_image = preprocessed_frame_queue.get()
         prev_time = time.time()
-        detections = darknet.detect_image(network, class_names, darknet_image, thresh=args.thresh)
+        detections = darknet.detect_image(network, class_names, darknet_image, thresh=threshold)
+        fps = 1 / (time.time() - prev_time)
         detections_queue.put(detections)
-        fps = int(1/(time.time() - prev_time))
-        fps_queue.put(fps)
-        print("FPS: {}".format(fps))
+        fps_queue.put(max(1, int(fps)))  # never enqueue 0: drawing() feeds it to cv2.waitKey(), and a 0 delay waits forever
+        print("FPS: {:.2f}".format(fps))
         darknet.print_detections(detections, args.ext_output)
         darknet.free_image(darknet_image)
-    cap.release()
 
 
-def drawing(frame_queue, detections_queue, fps_queue):
+def drawing(stop_flag, input_video_fps, queues, preproc_h, preproc_w, vid_h, vid_w):
     random.seed(3)  # deterministic bbox colors
-    video = set_saved_video(cap, args.out_filename, (video_width, video_height))
-    while cap.isOpened():
-        frame = frame_queue.get()
+    raw_frame_queue, preprocessed_frame_queue, detections_queue, fps_queue = queues
+    video = set_saved_video(args.out_filename, (vid_w, vid_h), input_video_fps)
+    fps = 1
+    while not stop_flag.is_set():
+        frame = raw_frame_queue.get()
         detections = detections_queue.get()
         fps = fps_queue.get()
         detections_adjusted = []
         if frame is not None:
             for label, confidence, bbox in detections:
-                bbox_adjusted = convert2original(frame, bbox)
+                bbox_adjusted = convert2original(frame, bbox, preproc_h, preproc_w)
                 detections_adjusted.append((str(label), confidence, bbox_adjusted))
             image = darknet.draw_boxes(detections_adjusted, frame, class_colors)
             if not args.dont_show:
-                cv2.imshow('Inference', image)
+                cv2.imshow("Inference", image)
             if args.out_filename is not None:
                 video.write(image)
             if cv2.waitKey(fps) == 27:
                 break
-    cap.release()
+    stop_flag.set()
     video.release()
     cv2.destroyAllWindows()
+    timeout = 1 / (fps if fps > 0 else 0.5)
+    for q in (preprocessed_frame_queue, detections_queue, fps_queue):
+        try:
+            q.get(block=True, timeout=timeout)
+        except queue.Empty:
+            pass
 
 
-if __name__ == '__main__':
-    frame_queue = Queue()
-    darknet_image_queue = Queue(maxsize=1)
-    detections_queue = Queue(maxsize=1)
-    fps_queue = Queue(maxsize=1)
-
+if __name__ == "__main__":
     args = parser()
     check_arguments_errors(args)
     network, class_names, class_colors = darknet.load_network(
-            args.config_file,
-            args.data_file,
-            args.weights,
-            batch_size=1
-        )
+        args.config_file,
+        args.data_file,
+        args.weights,
+        batch_size=1)
     darknet_width = darknet.network_width(network)
     darknet_height = darknet.network_height(network)
     input_path = str2int(args.input)
-    cap = cv2.VideoCapture(input_path)
+    cap = cv2.VideoCapture(input_path)  # Open video twice :(
     video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    Thread(target=video_capture, args=(frame_queue, darknet_image_queue)).start()
-    Thread(target=inference, args=(darknet_image_queue, detections_queue, fps_queue)).start()
-    Thread(target=drawing, args=(frame_queue, detections_queue, fps_queue)).start()
+    video_fps = int(cap.get(cv2.CAP_PROP_FPS))
+    cap.release()
+    del cap
+
+    ExecUnit = threading.Thread
+    Queue = queue.Queue
+    stop_flag = threading.Event()
+
+    raw_frame_queue = Queue()
+    preprocessed_frame_queue = Queue(maxsize=1)
+    detections_queue = Queue(maxsize=1)
+    fps_queue = Queue(maxsize=1)
+
+    exec_units = (
+        ExecUnit(target=video_capture, args=(stop_flag, input_path, raw_frame_queue, preprocessed_frame_queue,
+                                             darknet_height, darknet_width)),
+        ExecUnit(target=inference, args=(stop_flag, preprocessed_frame_queue, detections_queue, fps_queue,
+                                         network, class_names, args.thresh)),
+        ExecUnit(target=drawing, args=(stop_flag, video_fps,
+                                       (raw_frame_queue, preprocessed_frame_queue, detections_queue, fps_queue),
+                                       darknet_height, darknet_width, video_height, video_width)),
+    )
+    for exec_unit in exec_units:
+        exec_unit.start()
+    for exec_unit in exec_units:
+        exec_unit.join()
+
+    print("\nDone.")
+