PYNQ_301: OBJECT DETECTION¶


The aim of this notebook is to:

  • Understand what YOLOv3 and VOC are.
  • Learn how to use the YOLOv3 model to detect objects in an image and a webcam feed.
  • Learn how to display information on an OLED.

Useful Term Cheat Sheet:¶

Here is a cheat sheet of some useful terms that may pop up in this notebook. As always, feel free to ask me (Aamir) any questions!

| Term | Description |
| --- | --- |
| DPU | Deep-learning Processing Unit. The AI accelerator, running on the FPGA, that executes our model. |
| YOLO | You Only Look Once. This is the object detection algorithm we are using! |
| Overlay | Overlays are hardware designs that are loaded onto the FPGA. |
| FPGA | Field Programmable Gate Array. This is a special kind of chip that we can reprogram to do many different tasks very efficiently! |
| Array | Another word for list: multiple variables or items stored under one variable name. |
| Tensor | A multidimensional array (arrays within arrays). A 2-D tensor is also known as a matrix. |
| Anchors | A set of predefined bounding boxes of certain heights and widths. |
| OLED | Organic Light-Emitting Diode. We will use an OLED display to show some useful information after predictions. |

image.png

What is YOLO?¶


image-2.png

You Only Look Once (YOLO) is a Convolutional Neural Network (CNN) for performing object detection in real time. CNNs are classifier-based systems that process input images as structured arrays of data and recognize patterns within them.

What is VOC?¶


VOC is a dataset which contains data from the PASCAL Visual Object Classes Challenge. It includes a total of 11,540 images, where each image contains a set of objects drawn from 20 different classes, for a total of 27,450 annotated objects.
image-3.png
image-4.png

Hardware Setup¶


1. KRIA KV260 Board
image-2.png

2. PYNQ Grove Adapter
image.png

3. OLED Display
image-3.png

4. Webcam
image-5.png

Let's get started¶


1. Prepare the overlay¶

We will load the DPU overlay onto the board's FPGA.

In [ ]:
from pynq_dpu import DpuOverlay
from pynq_peripherals import PmodGroveAdapter
overlay = DpuOverlay("dpu.bit")
In [ ]:
# Initialize the Pmod adapter. G4 indicates that the OLED is connected to the G4 slot on the PYNQ Grove Adapter
adapter = PmodGroveAdapter(overlay.PMODA, G4='grove_oled')

2. Utility functions¶

In this section, we will prepare a few functions for later use.

In [ ]:
# Let's import some libraries we will need later on:
import os                                  # The os library provides functions for interacting with your operating system
import time                                # The time library provides time-related functions
import numpy as np                         # numpy (np) is a library for processing numerical data
import cv2                                 # cv2 (OpenCV) is a library for image processing which we need later
import random                              # This module provides pseudo-random number generation
import colorsys                            # This module provides bidirectional conversions between color systems
from matplotlib.patches import Rectangle   # This class is used to plot rectangles
import matplotlib.pyplot as plt            # Used to create figures and plot areas and lines within them
from IPython.display import display, Image # Imports public APIs for display tools in IPython

# This line enables matplotlib graphs to be included in the notebook, next to the code
%matplotlib inline                        

Now we'll load the YOLOv3 xmodel, which is trained on the VOC dataset.

In [ ]:
overlay.load_model("tf_yolov3_voc.xmodel")

The YOLOv3 model predicts offsets from a predetermined set of boxes with particular height-width ratios; these predetermined boxes are the anchor boxes. Let's define them.

In [ ]:
anchor_list = [10,13,16,30,33,23,30,61,62,45,59,119,116,90,156,198,373,326]
anchor_float = [float(x) for x in anchor_list]
anchors = np.array(anchor_float).reshape(-1, 2)
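
YOLOv3 emits predictions at three scales, and each scale uses three of these nine anchors (this pairing is encoded in the `anchor_mask` used by `evaluate` later in this notebook). A quick look at how the reshape groups them:

```python
import numpy as np

anchor_list = [10,13,16,30,33,23,30,61,62,45,59,119,116,90,156,198,373,326]
anchors = np.array(anchor_list, dtype=np.float32).reshape(-1, 2)

# Three anchors per output scale: the coarse 13x13 grid gets the largest
# boxes, the fine 52x52 grid the smallest.
anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
for mask, grid in zip(anchor_mask, ("13x13", "26x26", "52x52")):
    print(grid, anchors[mask].tolist())
```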

The VOC dataset consists of 20 classes of objects that can be detected; the voc_classes.txt file holds the list of class names.

image.png

In [ ]:
# Get Model Classification Information
def get_class(classes_path):
    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]
    return class_names
    
classes_path = "img/voc_classes.txt"
class_names = get_class(classes_path)

To interpret the model's output more easily, we want to draw a bounding box around each detected object and display a score representing the probability that the detected object belongs to a specific class.¶

image.png

We can associate each class with a specific color. The cell below does this for us.¶

In [ ]:
# Define unique colors for each class
num_classes = len(class_names)
hsv_tuples = [(1.0 * x / num_classes, 1., 1.) for x in range(num_classes)]
colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
colors = list(map(lambda x: 
                  (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), 
                  colors))
random.seed(0)
random.shuffle(colors)
random.seed(None)

Let's define some useful functions which will be used later in the notebook.¶

In [ ]:
# This function resizes the image with unchanged aspect ratio using padding.
def letterbox_image(image, size):
    ih, iw, _ = image.shape
    w, h = size
    scale = min(w/iw, h/ih)
    
    nw = int(iw*scale)
    nh = int(ih*scale)

    image = cv2.resize(image, (nw,nh), interpolation=cv2.INTER_LINEAR)
    new_image = np.ones((h,w,3), np.uint8) * 128
    h_start = (h-nh)//2
    w_start = (w-nw)//2
    new_image[h_start:h_start+nh, w_start:w_start+nw, :] = image
    return new_image

# This function pre-processes the image: converts BGR to RGB, letterboxes it,
# scales pixel values to [0, 1] and adds a batch dimension so it can be fed to the model.
def pre_process(image, model_image_size):
    image = image[...,::-1]
    image_h, image_w, _ = image.shape
 
    if model_image_size != (None, None):
        assert model_image_size[0]%32 == 0, 'Multiples of 32 required'
        assert model_image_size[1]%32 == 0, 'Multiples of 32 required'
        boxed_image = letterbox_image(image, tuple(reversed(model_image_size)))
    else:
        new_image_size = (image_w - (image_w % 32), image_h - (image_h % 32))
        boxed_image = letterbox_image(image, new_image_size)
    image_data = np.array(boxed_image, dtype='float32')
    image_data /= 255.
    image_data = np.expand_dims(image_data, 0) 
    return image_data
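
# A quick sanity check on the letterbox arithmetic above, using a
# hypothetical 640x480 webcam frame (a sketch, not part of the pipeline):
frame_h, frame_w = 480, 640
net_w, net_h = 416, 416
demo_scale = min(net_w / frame_w, net_h / frame_h)   # 0.65
demo_nw, demo_nh = int(frame_w * demo_scale), int(frame_h * demo_scale)
# The content becomes 416x312; (416-312)//2 = 52 px of grey padding is
# added above and below to fill the 416x416 network input.
print(demo_nw, demo_nh, (net_h - demo_nh) // 2)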

# This function gets information on box position, its size along with confidence and box class probabilities
def _get_feats(feats, anchors, num_classes, input_shape):
    num_anchors = len(anchors)
    anchors_tensor = np.reshape(np.array(anchors, dtype=np.float32), [1, 1, 1, num_anchors, 2])
    grid_size = np.shape(feats)[1:3]
    nu = num_classes + 5
    predictions = np.reshape(feats, [-1, grid_size[0], grid_size[1], num_anchors, nu])
    grid_y = np.tile(np.reshape(np.arange(grid_size[0]), [-1, 1, 1, 1]), [1, grid_size[1], 1, 1])
    grid_x = np.tile(np.reshape(np.arange(grid_size[1]), [1, -1, 1, 1]), [grid_size[0], 1, 1, 1])
    grid = np.concatenate([grid_x, grid_y], axis = -1)
    grid = np.array(grid, dtype=np.float32)

    box_xy = (1/(1+np.exp(-predictions[..., :2])) + grid) / np.array(grid_size[::-1], dtype=np.float32)
    box_wh = np.exp(predictions[..., 2:4]) * anchors_tensor / np.array(input_shape[::-1], dtype=np.float32)
    box_confidence = 1/(1+np.exp(-predictions[..., 4:5]))
    box_class_probs = 1/(1+np.exp(-predictions[..., 5:]))
    return box_xy, box_wh, box_confidence, box_class_probs
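
# The raw network outputs are unbounded, so _get_feats applies a sigmoid,
# 1/(1+exp(-x)), to squash the x/y offsets, confidence and class scores
# into (0, 1). A toy illustration with made-up logits:
import math   # stdlib; used only for this sketch
for demo_x in (-2.0, 0.0, 2.0):
    print(round(1 / (1 + math.exp(-demo_x)), 3))   # 0.119, 0.5, 0.881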


# This function is used to correct the bounding box position by scaling it
def correct_boxes(box_xy, box_wh, input_shape, image_shape):
    box_yx = box_xy[..., ::-1]
    box_hw = box_wh[..., ::-1]
    input_shape = np.array(input_shape, dtype = np.float32)
    image_shape = np.array(image_shape, dtype = np.float32)
    new_shape = np.around(image_shape * np.min(input_shape / image_shape))
    offset = (input_shape - new_shape) / 2. / input_shape
    scale = input_shape / new_shape
    box_yx = (box_yx - offset) * scale
    box_hw *= scale

    box_mins = box_yx - (box_hw / 2.)
    box_maxes = box_yx + (box_hw / 2.)
    boxes = np.concatenate([
        box_mins[..., 0:1],
        box_mins[..., 1:2],
        box_maxes[..., 0:1],
        box_maxes[..., 1:2]
    ], axis = -1)
    boxes *= np.concatenate([image_shape, image_shape], axis = -1)
    return boxes

# This function is used to get information on the valid objects detected and their scores
def boxes_and_scores(feats, anchors, classes_num, input_shape, image_shape):
    box_xy, box_wh, box_confidence, box_class_probs = _get_feats(feats, anchors, classes_num, input_shape)
    boxes = correct_boxes(box_xy, box_wh, input_shape, image_shape)
    boxes = np.reshape(boxes, [-1, 4])
    box_scores = box_confidence * box_class_probs
    box_scores = np.reshape(box_scores, [-1, classes_num])
    return boxes, box_scores


# This function performs non-maximum suppression: boxes that overlap a
# higher-scoring box by more than the IoU threshold are discarded.
def nms_boxes(boxes, scores):
    """Suppress non-maximal boxes.

    # Arguments
        boxes: ndarray, boxes of objects.
        scores: ndarray, scores of objects.

    # Returns
        keep: ndarray, index of effective boxes.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2-x1+1)*(y2-y1+1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)

        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w1 = np.maximum(0.0, xx2 - xx1 + 1)
        h1 = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w1 * h1

        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        inds = np.where(ovr <= 0.55)[0]  # threshold
        order = order[inds + 1]

    return keep
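
# Toy illustration of the 0.55 IoU threshold used above: two boxes that
# overlap heavily, so NMS would keep only the higher-scoring one.
# (A sketch with hypothetical coordinates, not part of the pipeline.)
demo_a = [0.0, 0.0, 100.0, 100.0]   # x1, y1, x2, y2
demo_b = [10.0, 10.0, 110.0, 110.0]
demo_iw = max(0.0, min(demo_a[2], demo_b[2]) - max(demo_a[0], demo_b[0]) + 1)
demo_ih = max(0.0, min(demo_a[3], demo_b[3]) - max(demo_a[1], demo_b[1]) + 1)
demo_area = (demo_a[2] - demo_a[0] + 1) * (demo_a[3] - demo_a[1] + 1)  # both boxes have equal area
demo_iou = demo_iw * demo_ih / (2 * demo_area - demo_iw * demo_ih)
print(round(demo_iou, 3))   # well above 0.55, so the overlap is suppressed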

# This function gives essential information about the objects detected like bounding box information, score of the object 
# detected and the class associated with it
def evaluate(yolo_outputs, image_shape, class_names, anchors):
    score_thresh = 0.2
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    boxes = []
    box_scores = []
    input_shape = np.shape(yolo_outputs[0])[1 : 3]
    input_shape = np.array(input_shape)*32

    for i in range(len(yolo_outputs)):
        _boxes, _box_scores = boxes_and_scores(
            yolo_outputs[i], anchors[anchor_mask[i]], len(class_names), 
            input_shape, image_shape)
        boxes.append(_boxes)
        box_scores.append(_box_scores)
    boxes = np.concatenate(boxes, axis = 0)
    box_scores = np.concatenate(box_scores, axis = 0)

    mask = box_scores >= score_thresh
    boxes_ = []
    scores_ = []
    classes_ = []
    for c in range(len(class_names)):
        class_boxes_np = boxes[mask[:, c]]
        class_box_scores_np = box_scores[:, c]
        class_box_scores_np = class_box_scores_np[mask[:, c]]
        nms_index_np = nms_boxes(class_boxes_np, class_box_scores_np) 
        class_boxes_np = class_boxes_np[nms_index_np]
        class_box_scores_np = class_box_scores_np[nms_index_np]
        classes_np = np.ones_like(class_box_scores_np, dtype = np.int32) * c
        boxes_.append(class_boxes_np)
        scores_.append(class_box_scores_np)
        classes_.append(classes_np)
    boxes_ = np.concatenate(boxes_, axis = 0)
    scores_ = np.concatenate(scores_, axis = 0)
    classes_ = np.concatenate(classes_, axis = 0)

    return boxes_, scores_, classes_

# This function is used to draw boxes around objects post prediction.
def draw_boxes(image, boxes, scores, classes):
    _, ax = plt.subplots(1)
    ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    image_h, image_w, _ = image.shape

    for i, bbox in enumerate(boxes):
        [top, left, bottom, right] = bbox
        width, height = right - left, bottom - top
        center_x, center_y = left + width*0.5, top + height*0.5
        score, class_index = scores[i], classes[i]
        label = '{}: {:.4f}'.format(class_names[class_index], score) 
        color = tuple([color/255 for color in colors[class_index]])
        ax.add_patch(Rectangle((left, top), width, height,
                               edgecolor=color, facecolor='none'))
        ax.annotate(label, (center_x, center_y), color=color, weight='bold', 
                    fontsize=12, ha='center', va='center')
    return ax
In [ ]:
# Collect the images with the "JPEG" extension in the 'img' directory and count them
image_folder = 'img'
original_images = [i for i in os.listdir(image_folder) if i.endswith("JPEG")]
total_images = len(original_images)
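
`os.listdir` returns every entry in the folder, so the comprehension above keeps only the JPEG files. A sketch with hypothetical filenames (not the actual contents of `img/`):

```python
# Hypothetical directory listing -- the notebook uses os.listdir('img')
entries = ["cat.JPEG", "voc_classes.txt", "dog.JPEG", "oled.png"]
jpegs = [i for i in entries if i.endswith("JPEG")]
print(jpegs, len(jpegs))   # only the JPEG frames survive
```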

3. Object Detection on an Image from the Filesystem¶

The next few cells help us in processing data for our predictions.

In [ ]:
# Setup input and output tensors
dpu = overlay.runner
inputTensors = dpu.get_input_tensors()
outputTensors = dpu.get_output_tensors()

shapeIn = tuple(inputTensors[0].dims)

shapeOut0 = (tuple(outputTensors[0].dims)) # (1, 13, 13, 75)
shapeOut1 = (tuple(outputTensors[1].dims)) # (1, 26, 26, 75)
shapeOut2 = (tuple(outputTensors[2].dims)) # (1, 52, 52, 75)

outputSize0 = int(outputTensors[0].get_data_size() / shapeIn[0]) # 12675
outputSize1 = int(outputTensors[1].get_data_size() / shapeIn[0]) # 50700
outputSize2 = int(outputTensors[2].get_data_size() / shapeIn[0]) # 202800

# Setup Buffers
input_data = [np.empty(shapeIn, dtype=np.float32, order="C")]
output_data = [np.empty(shapeOut0, dtype=np.float32, order="C"), 
               np.empty(shapeOut1, dtype=np.float32, order="C"),
               np.empty(shapeOut2, dtype=np.float32, order="C")]
image = input_data[0]

The function defined below is the main function: it pre-processes a frame, runs the model, and decodes the output.¶

In [ ]:
# Function to perform pre-processing, model predictions and decoding output
def run(frame, display=False):
    
    # Pre-processing
    image_size = frame.shape[:2]
    image_data = np.array(pre_process(frame, (416, 416)), dtype=np.float32)
    
    # Send the input data to the DPU and trigger execution
    image[0,...] = image_data.reshape(shapeIn[1:])
    job_id = dpu.execute_async(input_data, output_data)
    dpu.wait(job_id)
    
    # Retrieve output data
    conv_out0 = np.reshape(output_data[0], shapeOut0)
    conv_out1 = np.reshape(output_data[1], shapeOut1)
    conv_out2 = np.reshape(output_data[2], shapeOut2)
    yolo_outputs = [conv_out0, conv_out1, conv_out2]
    
    # Decode output from YOLOv3
    boxes, scores, classes = evaluate(yolo_outputs, image_size, class_names, anchors)
    
    if display:
        _ = draw_boxes(frame, boxes, scores, classes)
        
    return boxes, scores, classes
In [ ]:
# Read an input image from the "img" directory
input_image = cv2.imread(os.path.join(image_folder, original_images[4]))
In [ ]:
# Perform pre-processing, model predictions and decode the output from the image
run(input_image, display=True)

4. Object Detection by Using a Webcam¶

In [ ]:
# Start capturing a video 
videoIn = cv2.VideoCapture(0)
videoIn.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # Disable buffering
videoIn.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
videoIn.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

print("Capture device is open: " + str(videoIn.isOpened()))

Running the next cell captures a single frame from the webcam¶

In [ ]:
# Extract the frame from the video
ret, frame = videoIn.read()
In [ ]:
# Perform predictions on the frame
boxes, scores, classes = run(frame, display=True)
In [ ]:
# Find the index of the detection with the highest score, then look up its class name
best_score = np.argmax(scores)
class_names[classes[best_score]]
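
Note that `np.argmax` returns the *index* of the largest score; `classes` is parallel to `scores`, so the same index selects the matching class. A toy example with made-up values:

```python
import numpy as np

# Toy stand-ins (hypothetical values, not the model's output):
demo_scores = np.array([0.31, 0.92, 0.45])
demo_classes = np.array([11, 14, 7], dtype=np.int32)

# argmax returns the *index* of the highest score; the same index then
# picks out the matching entry in the parallel classes array.
demo_best = int(np.argmax(demo_scores))
print(demo_best, int(demo_classes[demo_best]))   # index 1 -> class 14
```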

We will now print the object with the highest score on the OLED.

In [ ]:
oled = adapter.G4
oled.set_default_config()
oled.set_normal_display()
oled.put_string("Detected") 
oled.set_position(2, 0)
oled.put_string(f"{class_names[classes[best_score]]}")
In [ ]:
videoIn.release()    # Releasing the video capture object

Example OLED output when a monitor is detected:¶

image.png

5. Real Time Object Detection from webcam¶

In [ ]:
cap = cv2.VideoCapture(0)    # Start the video capture
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # Disable buffering
In [ ]:
display_handle = display(None, display_id=True)    # Create a display handle so frames can be updated in place

We keep track of only the best-scoring object in each frame...

In [ ]:
last_object = "None"

# while True:
for i in range(200):
    _, frame = cap.read()
    boxes, scores, classes = run(frame)
    
    if scores.any():

        best_score = np.argmax(scores)

        # Draw bounding box
        y_min,x_min,y_max,x_max = map(int, boxes[best_score])
        frame = cv2.rectangle(frame, (x_min,y_min), (x_max, y_max), color=255)

        # Label
        text = f"{class_names[classes[best_score]]}: {scores[best_score]:.2f}"
        text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0]
        frame = cv2.putText(frame, text, (x_min, y_min - text_size[1]), cv2.FONT_HERSHEY_SIMPLEX, 0.5, 255, 1, cv2.LINE_AA)

        _, frame = cv2.imencode('.jpeg', frame)
        display_handle.update(Image(data=frame.tobytes()))
        
        if class_names[classes[best_score]] == last_object:
            pass
        else:
            oled.clear_display()
            oled.put_string(class_names[classes[best_score]])
            last_object = class_names[classes[best_score]]
In [ ]:
cap.release()    # Releasing the video capture object

END OF NOTEBOOK


CHALLENGES:¶

Modify the code in section 5 above in incremental steps:¶

  • Draw a bounding box only when a person is detected.
  • Draw a bounding box when a person is detected with a score of 0.8 and above.

Hint¶

Check the return type of classes.
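
For instance, since `classes` comes back as a NumPy integer array, you can combine a class mask with a score mask. A minimal sketch with toy detections (hypothetical values, not model output):

```python
import numpy as np

demo_names = ["cat", "dog", "person"]               # hypothetical ordering
demo_classes = np.array([2, 0, 2], dtype=np.int32)  # toy detections
demo_scores = np.array([0.91, 0.64, 0.52])

person = demo_names.index("person")
demo_mask = (demo_classes == person) & (demo_scores >= 0.8)
print(demo_mask.tolist())   # only the confident person detection survives
```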

Bonus Challenge: Display multiple objects detected in a video¶

In [ ]:
cap = cv2.VideoCapture(0)    # Start the video capture
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # Disable buffering
In [ ]:
display_handle=display(None, display_id=True)    # This displays the video once the frame is updated
In [ ]:
# Enter Code Here..




        
cap.release()    # Releasing the video capture object

Let's clear the overlay and the DPU-specific data:

In [ ]:
del overlay # Clean up code