- Date: 2020/06/28
- Author: [email protected]
- 初衷: 在口罩人臉識別中,推理實現三種輸入:
1. image; 2. video; 3. camera
- 參考:
文章目錄
0x00 prepare
- 爲什麼不使用這個 FaceMaskDetection/tensorflow_infer.py 項目呢?
因爲它不提供訓練的代碼,提供視頻和圖片的推理,視頻fps 90~100 之間
- 爲什麼用face-mask-detection-tf2/inference.py 這個項目呢?
因爲它提供了訓練的代碼,提供了圖片和攝像頭的推理
基於PureHing/face-mask-detection-tf2 項目,對於該項目的改動,目前我就改動了inference.py
這個文件,新增了視頻流輸入,提取出圖片輸入代碼爲run_img
函數
This model is a lightweight face mask detection model. Based on ssd,the backbone is Mobilenet and RFB.
- Tensorflow 2.1
- 優點:
- 訓練到推理,一條龍
- 缺點:
- 推理時,只提供了圖片和攝像頭的輸入,沒有視頻的輸入
- 目的:
- 掌握 opencv 的圖片輸入推理和攝像頭輸入推理
- 實現視頻輸入推理並保存
- 掌握
import cv2
import os
import time
import numpy as np
import tensorflow as tf
from absl import flags, app
from absl.flags import FLAGS
from pathlib import Path
import logging
0x01 圖片輸入的推理過程
1.1 logging
輸出日誌 及 判斷是否存在img_path
logging.info(f"[*] image path: {img_path}")
assert os.path.exists(img_path), \
(f"Cannot find image path from {img_path}")
logging.info("[*] Predict {} image... ".format(img_path))
I0628 17:00:31.323117 139714559833920 inference.py:98] [*] image path: assets/test2.jpg
I0628 17:00:31.323309 139714559833920 inference.py:101] [*] Predict assets/test2.jpg image...
1.2 讀取圖片
opencv
圖片讀取是BGR
格式,所以需要轉換一下,轉成RGB
格式
# Read the image from disk; cv2.imread yields pixels in BGR channel order.
img_raw = cv2.imread(img_path)
# get image size, (572, 950)
img_height_raw, img_width_raw, _ = img_raw.shape
# Convert BGR to RGB. The conversion code is cv2.COLOR_BGR2RGB
# (there is no cv2.BGR2RGB attribute).
img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)
此時,可以輸出img
看一下
cv2.imshow('img_raw', img_raw)
cv2.imshow('img', img[:, :, ::-1])
cv2.waitKey(0)
1.3 padding & normalization
-
cfg['steps'] = [8, 16, 32, 64]
-
img: (572, 950) —> (576, 960)
-
pad_params: (img_h, img_w, img_pad_h, img_pad_w)
(572, 950, 4, 10)
# pad input image to avoid unmatched shape problem
img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
# 歸一化, 範圍限制在 [-0.5, 0.5],
img = img / 255.0 - 0.5
def pad_input_image(img, max_steps):
    """Pad the bottom and right edges so both sides divide by ``max_steps``.

    The added border is filled with the per-channel mean colour of ``img``.

    Args:
        img: Image array whose shape unpacks as (height, width, channels).
        max_steps: Alignment step; each side is grown to the next multiple.

    Returns:
        A ``(padded_img, pad_params)`` pair, where ``pad_params`` is
        ``(img_h, img_w, img_pad_h, img_pad_w)``: the original height and
        width followed by the number of rows and columns that were added.
    """

    def _shortfall(size):
        # Amount missing to reach the next multiple (0 if already aligned).
        remainder = size % max_steps
        return max_steps - remainder if remainder > 0 else 0

    img_h, img_w, _channels = img.shape
    img_pad_h = _shortfall(img_h)
    img_pad_w = _shortfall(img_w)

    # Mean over both spatial axes gives one fill value per channel.
    fill_colour = np.mean(img, axis=(0, 1)).astype(np.uint8).tolist()
    padded = cv2.copyMakeBorder(
        img,
        0,
        img_pad_h,
        0,
        img_pad_w,
        cv2.BORDER_CONSTANT,
        value=fill_colour,
    )
    return padded, (img_h, img_w, img_pad_h, img_pad_w)
1.4 get prior boxes
"min_sizes":[[10, 16, 24], [32, 48], [64, 96], [128, 192, 256]],
"steps": [8, 16, 32, 64]
四個特徵圖,每個特徵圖上的每一個像素點對應的有[3, 2, 2, 3]個框,第一個特徵圖上的點有3個框,因此
總共的預選框的數量 = sum(每個特徵圖大小 * 框的數量)
In [21]: def fun(sum, a):
    ...:     for i in range(len(a)):
    ...:         if i == 1 or i == 2:
    ...:             sum = sum + np.ceil(576/a[i])*np.ceil(960/a[i])*2
    ...:         else:
    ...:             sum = sum + np.ceil(576/a[i])*np.ceil(960/a[i])*3
    ...:     print(sum)
    ...:

In [22]: a
Out[22]: [8, 16, 32, 64]

In [23]: fun(0, a)
31725.0
# sum(feature map size[i] * len(min_size[i])) 31725
priors, _ = priors_box(cfg, image_sizes=(img.shape[0], img.shape[1]))
priors = tf.cast(priors, tf.float32)
1.5 inference
增加一個維度,即設置batch=1
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
img_np_expanded = np.expand_dims(img, axis=0)
# shape: [1, anchors number, 3 classes + 4 xyzw]
predictions = model.predict(img_np_expanded)
使用 score_threshold 和 nms_threshold 篩選出符合條件的 boxes、classes、scores
# split three parts
boxes, classes, scores = parse_predict(predictions, priors, cfg)
1.6 other
此時已經基本做好了檢測和類別分類,剩下的就是保存和顯示圖片
# recover padding effect
boxes = recover_pad_output(boxes, pad_params)
# draw and save results
save_img_path = os.path.join('assets', 'out_'+os.path.basename(img_path))
for prior_index in range(len(boxes)):
show_image(img_raw, boxes, classes, scores, img_height_raw, img_width_raw, prior_index, cfg['labels_list'])
cv2.imwrite(save_img_path, img_raw)
cv2.imshow('results', img_raw)
cv2.waitKey(0)
- 源碼
def run_img(img_path, cfg, show_result=True):
    """Run mask detection on a single image file and save the annotated copy.

    The annotated image is written to ``assets/out_<original file name>``.

    Args:
        img_path: Path (``str`` or ``Path``) of the image to read.
        cfg: Configuration mapping. ``cfg['steps']`` and
            ``cfg['labels_list']`` are read here; the whole mapping is also
            passed to ``priors_box`` and ``parse_predict``.
        show_result: When true (the default), display the annotated image in
            a window and block until a key is pressed. When false, only the
            output file is written.

    Raises:
        FileNotFoundError: ``img_path`` does not exist.
        ValueError: The file exists but OpenCV could not decode it.

    Note:
        Uses a ``model`` object that is not a parameter; it must be
        available in the enclosing/module scope.
    """
    img_path = Path(img_path)
    logging.info(f"[*] image path: {img_path}")
    # Explicit exception instead of ``assert`` so the check survives ``-O``.
    if not img_path.exists():
        raise FileNotFoundError(f"Cannot find image path from {img_path}")
    logging.info("[*] Predict {} image... ".format(img_path))

    # read image, default is BGR
    img_raw = cv2.imread(str(img_path))
    # cv2.imread signals failure by returning None rather than raising.
    if img_raw is None:
        raise ValueError(f"Cannot decode image from {img_path}")

    # get image size, e.g. (572, 950)
    img_height_raw, img_width_raw, _ = img_raw.shape
    # convert BGR to RGB
    img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)

    # pad input image to avoid unmatched shape problem
    img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
    # normalise pixel values into [-0.5, 0.5]
    img = img / 255.0 - 0.5

    # prior boxes are generated for the padded image size
    priors, _ = priors_box(cfg, image_sizes=(img.shape[0], img.shape[1]))
    priors = tf.cast(priors, tf.float32)

    # add a leading batch axis: the model is fed a batch of one image
    img_np_expanded = np.expand_dims(img, axis=0)
    predictions = model.predict(img_np_expanded)

    # split the raw predictions into boxes, classes and scores
    boxes, classes, scores = parse_predict(predictions, priors, cfg)
    logging.info(f"scores:{scores}")

    # recover padding effect
    boxes = recover_pad_output(boxes, pad_params)

    # draw and save results (drawn onto the original BGR image)
    save_img_path = Path('assets') / ('out_' + img_path.name)
    for prior_index in range(len(boxes)):
        show_image(img_raw, boxes, classes, scores, img_height_raw,
                   img_width_raw, prior_index, cfg['labels_list'])
    cv2.imwrite(str(save_img_path), img_raw)

    # Honour the flag: previously the window was shown unconditionally.
    if show_result:
        cv2.imshow('results', img_raw)
        cv2.waitKey(0)
0x02 視頻流輸入推理
- 在視頻推理的過程中,fps 的值爲11~ 20 之間,參考2中的fps爲90~100之間
究其原因,可能是在於預選框的數量不同。而預選框的數量和特徵圖大小相關。
參考1中的特徵圖的大小和原始尺寸相關,具體的計算方式如下:
import math

# Input resolution (height, width) and the stride of each detection layer.
image_sizes = [720, 1280]
steps = [8, 16, 32, 64]
# Sizes of the four feature maps: one [rows, cols] pair per stride.
feature_maps = [
    [math.ceil(image_sizes[0] / step), math.ceil(image_sizes[1] / step)]
    for step in steps
]
參考2中的特徵圖大小是預先設定的,
# anchor configuration
# One entry per feature map (five maps in total).
feature_map_sizes = [[33, 33], [17, 17], [9, 9], [5, 5], [3, 3]]
anchor_sizes = [[0.04, 0.056], [0.08, 0.11], [0.16, 0.22], [0.32, 0.45], [0.64, 0.72]]
# The same ratio list is used for each of the five feature maps.
anchor_ratios = [[1, 0.62, 0.42]] * 5
所以這大概就是fps之間的區別,還有一個可能的因素是在於骨幹網絡的不同
# 參考1的視頻推理時間:
read_frame:0.002267, infer time:0.084887, write time:0.004903
# 參考2的視頻推理時間:
read_frame:0.001686, infer time:0.018405, write time:0.009145
- 另一個就是在最後的輸出結果當中,我把score調成了0.9,纔有了想要的效果
2.1 創建視頻讀取對象
# 建立一個讀取視頻對象
cap = cv2.VideoCapture(video_path)
2.2 獲取每一幀的信息
# 讀取視頻的每幀信息 & fps
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
fps = cap.get(cv2.CAP_PROP_FPS)
2.3 獲取視頻的總幀數
# 獲取視頻總幀數
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
total_frames = int(total_frames)
assert cap.isOpened(), ("Video open failed.")
2.4 創建一個視頻寫入對象
# 設置儲存視頻的格式
fourcc = cv2.VideoWriter_fourcc(*'XVID')
# 創建一個視頻寫入對象
writer = cv2.VideoWriter(output_video_name, fourcc,
fps, (int(width), int(height)))
2.5 迭代每一幀,對每一幀進行判斷
# Read one frame; ``ret`` is False when no frame could be grabbed.
ret, img_raw = cap.read()
# The assertion message must be a plain string: ``print(...)`` returns None,
# so using it here would produce an AssertionError with no message.
assert ret, 'No video found'
# Convert BGR (OpenCV default) to RGB.
img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)
# cv2.imshow('img', img[:, :, ::-1])
# key = cv2.waitKey(0)
2.6 other
接下來就和上面的圖片輸入差不多了,下面是源碼。不同的地方在於視頻保存和圖片保存的方式不一樣,另外,我把cfg.score_threshold
設置成了0.9.
def run_on_video(video_path, output_video_name, conf_thresh, cfg, model):
    """Run mask detection on every frame of a video and save an annotated copy.

    Each frame is converted to RGB, normalised, passed through
    ``model.predict`` and decoded with ``parse_predict``. Detections and an
    FPS label are drawn onto the original BGR frame, which is shown in a
    window and appended to the output video. Per-frame timings are printed.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        output_video_name: Destination file for the XVID-encoded result.
        conf_thresh: Not used by this function.
            NOTE(review): score filtering presumably happens inside
            ``parse_predict`` via ``cfg`` - confirm against that helper.
        cfg: Configuration mapping; ``cfg['labels_list']`` is read here and
            the whole mapping is passed to ``priors_box`` and
            ``parse_predict``.
        model: Object exposing ``predict``; fed a batch of one frame.

    Raises:
        IOError: The video source could not be opened.
    """
    cap = cv2.VideoCapture(video_path)
    # Validate the capture first: its properties size the writer below, and
    # an unopened capture would otherwise create a useless output file.
    if not cap.isOpened():
        cap.release()
        raise IOError("Video open failed.")

    writer = None
    try:
        # frame geometry, frame rate and reported frame count of the source
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # output container/codec and writer with the same geometry as input
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        writer = cv2.VideoWriter(output_video_name, fourcc,
                                 fps, (width, height))

        # prior boxes depend only on the frame size, so build them once
        priors, _ = priors_box(cfg, image_sizes=(height, width))
        priors = tf.cast(priors, tf.float32)

        for idx in range(total_frames):
            start_stamp = time.time()
            ret, img_raw = cap.read()
            if not ret:
                # The reported frame count can exceed the number of frames
                # that can actually be decoded; stop cleanly instead of
                # aborting so the frames written so far are kept.
                logging.warning("No frame read at %d of %d; stopping.",
                                idx + 1, total_frames)
                break
            img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)
            # reading time
            read_frame_stamp = time.time()

            # normalise pixel values into [-0.5, 0.5]
            img = img / 255.0 - 0.5
            # add a leading batch axis before prediction
            predictions = model.predict(img[np.newaxis, ...])
            boxes, classes, scores = parse_predict(predictions, priors, cfg)

            for prior_index in range(len(classes)):
                show_image(img_raw, boxes, classes, scores, height, width,
                           prior_index, cfg['labels_list'])

            # FPS covers inference + drawing (time since the frame was read)
            fps_str = "FPS: %.2f" % (1 / (time.time() - read_frame_stamp))
            cv2.putText(img_raw, fps_str, (25, 25),
                        cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 255, 0), 2)

            # show frame
            cv2.imshow('result', img_raw)
            cv2.waitKey(1)
            # inference time
            inference_stamp = time.time()

            # write frame
            writer.write(img_raw)
            write_frame_stamp = time.time()

            print("%d of %d" % (idx + 1, total_frames))
            print("read_frame:%f, infer time:%f, write time:%f" %
                  (read_frame_stamp - start_stamp,
                   inference_stamp - read_frame_stamp,
                   write_frame_stamp - inference_stamp))
    finally:
        # Release both handles even if inference or drawing raised.
        cap.release()
        if writer is not None:
            writer.release()
0x03 Camera
最後的攝像頭部分,我沒有usb攝像頭,故沒有深入研究,只是將他摘出來,嵌套在main函數中, 最終的效果還是在於產生一個240*320的窗口,實時檢測
def run_camera():
    """Run live mask detection on camera 0 until ``q`` is pressed.

    The capture is asked for 320x240 frames and prior boxes are built once
    for ``(240, 320)``. Each frame is converted to RGB, normalised, passed
    to ``model`` and decoded with ``parse_predict``; detections and an FPS
    label are drawn on the frame, which is shown in a window. The function
    returns after ``q`` is pressed, releasing the camera and closing the
    OpenCV windows.

    Raises:
        RuntimeError: A frame could not be read from the camera.

    Note:
        Uses ``cfg`` and ``model`` objects that are not parameters; they
        must be available in the enclosing/module scope.
        NOTE(review): assumes the camera honours the requested 320x240
        size, since the priors are fixed to it - confirm on real hardware.
    """
    capture = cv2.VideoCapture(0)
    try:
        capture.set(cv2.CAP_PROP_FRAME_WIDTH, 320)
        capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 240)

        priors, _ = priors_box(cfg, image_sizes=(240, 320))
        priors = tf.cast(priors, tf.float32)
        start = time.time()

        while True:
            ret, frame = capture.read()
            # Test the success flag / None explicitly: truth-testing a
            # multi-element NumPy array raises ValueError, so the former
            # ``assert frame`` failed on every successfully read frame.
            if not ret or frame is None:
                raise RuntimeError('No camera found')

            h, w, _ = frame.shape
            img = np.float32(frame.copy())

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # normalise pixel values into [-0.5, 0.5]
            img = img / 255.0 - 0.5

            # add a leading batch axis before calling the model
            predictions = model(img[np.newaxis, ...])
            boxes, classes, scores = parse_predict(predictions, priors, cfg)

            for prior_index in range(len(classes)):
                show_image(frame, boxes, classes, scores, h, w,
                           prior_index, cfg['labels_list'])

            # calculate fps from the time spent on the previous iteration
            fps_str = "FPS: %.2f" % (1 / (time.time() - start))
            start = time.time()
            cv2.putText(frame, fps_str, (25, 25),
                        cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 255, 0), 2)

            # show frame
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) == ord('q'):
                # Leave the loop so the cleanup below runs, instead of
                # terminating the interpreter with the camera still open.
                break
    finally:
        capture.release()
        cv2.destroyAllWindows()
0x04 不足
該項目推理時的最精華的部分正是在於prior boxes
的生成,以及nms
篩選機制,曾經在筆試的時候,有考題是讓學生手撕iou
算法,我當時有靈感,但是沒寫全,有點遺憾
以後待補
上述說的即以下兩個函數:
priors, _ = priors_box(cfg, image_sizes=(int(height), int(width)))
boxes, classes, scores = parse_predict(predictions, priors, cfg)