Dive into Deep Learning (TensorFlow) --- Study Notes (Part 10: Computer Vision)

Much of the material on formulas and basic theory is excerpted from Dive into Deep Learning (TF2.0 edition).

This part covers computer vision; the CNN models discussed earlier are closely tied to vision tasks.

Through the CNN chapters we learned about image structure and image classification. Computer vision has two further topics of major importance: detecting objects inside an image (object detection), and reusing models trained on other data (transfer learning and fine-tuning).

Before discussing those, let's first go over some other basics of computer vision.

Image Augmentation

Common methods: flipping, cropping, and changing colors.

(Figures omitted: the original image; random flips; random crops; color jitter, covering brightness and hue.)

The code is as follows:

import tensorflow as tf
import numpy as np
print(tf.__version__)
from matplotlib import pyplot as plt

img = plt.imread('img/girl.jpg')
print(img.shape)
plt.imshow(img)
plt.show()
#plotting helper
def show_images(imgs, num_rows, num_cols, scale=2):
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = plt.subplots(num_rows, num_cols, figsize=figsize)
    for i in range(num_rows):
        for j in range(num_cols):
            axes[i][j].imshow(imgs[i * num_cols + j])
            axes[i][j].axes.get_xaxis().set_visible(False)
            axes[i][j].axes.get_yaxis().set_visible(False)
    plt.show()
    return axes
#run the augmentation aug several times and show all results, 2*4 by default
def apply(img, aug, num_rows=2, num_cols=4, scale=1.5):
    Y = [aug(img) for _ in range(num_rows * num_cols)]
    show_images(Y, num_rows, num_cols, scale)
#random horizontal flip
apply(img, tf.image.random_flip_left_right)
#random vertical flip
apply(img, tf.image.random_flip_up_down)
#randomly crop a fixed 600*600 patch at a random location (unlike the book's RandomResizedCrop, tf.image.random_crop does not sample a 10%-100% area and resize)
aug=tf.image.random_crop
num_rows=2
num_cols=4
scale=1.5
#crop size in pixels
crop_size=600
Y = [aug(img, (crop_size, crop_size, 3)) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)
#randomly shift the brightness; the delta is drawn from [-0.5, 0.5] and applied in the [0,1) float representation
#implemented with tf.image.random_brightness
aug=tf.image.random_brightness
num_rows=2
num_cols=4
scale=1.5
max_delta=0.5
Y = [aug(img, max_delta) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)
#randomly shift the hue; the delta is drawn from [-0.5, 0.5], the largest range tf.image.random_hue allows
#implemented with tf.image.random_hue
aug=tf.image.random_hue
num_rows=2
num_cols=4
scale=1.5
max_delta=0.5

Y = [aug(img, max_delta) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)
#color can be altered in 4 respects: brightness, contrast, saturation and hue; the remaining two are sketched below
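The two remaining color transforms work the same way. A minimal sketch using tf.image.random_contrast and tf.image.random_saturation (the 0.5-1.5 factor ranges here are arbitrary choices, not values from the book):

#contrast: scale each pixel's deviation from the channel mean by a random factor in [lower, upper]
aug=tf.image.random_contrast
Y = [aug(img, lower=0.5, upper=1.5) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)
#saturation: scale the saturation by a random factor in [lower, upper]
aug=tf.image.random_saturation
Y = [aug(img, lower=0.5, upper=1.5) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)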

Training a Model with Image Augmentation


import tensorflow as tf
import numpy as np
print(tf.__version__)
from matplotlib import pyplot as plt
#plotting helper
def show_images(imgs, num_rows, num_cols, scale=2):
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = plt.subplots(num_rows, num_cols, figsize=figsize)
    for i in range(num_rows):
        for j in range(num_cols):
            axes[i][j].imshow(imgs[i * num_cols + j])
            axes[i][j].axes.get_xaxis().set_visible(False)
            axes[i][j].axes.get_yaxis().set_visible(False)
    plt.show()
    return axes
#load the CIFAR-10 dataset
(x, y), (test_x, test_y) = tf.keras.datasets.cifar10.load_data()
print(x.shape, test_x.shape)
#plot the first 8 images
show_images(x[0:8], 2, 4, scale=0.8)
#define a residual network (ResNet)
from tensorflow.keras import layers,activations
class Residual(tf.keras.Model):
    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super(Residual, self).__init__(**kwargs)
        self.conv1 = layers.Conv2D(num_channels,
                                   padding='same',
                                   kernel_size=3,
                                   strides=strides)
        self.conv2 = layers.Conv2D(num_channels, kernel_size=3,padding='same')
        if use_1x1conv:
            self.conv3 = layers.Conv2D(num_channels,
                                       kernel_size=1,
                                       strides=strides)
        else:
            self.conv3 = None
        self.bn1 = layers.BatchNormalization()
        self.bn2 = layers.BatchNormalization()

    def call(self, X):
        Y = activations.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return activations.relu(Y + X)

class ResnetBlock(tf.keras.layers.Layer):
    def __init__(self,num_channels, num_residuals, first_block=False,**kwargs):
        super(ResnetBlock, self).__init__(**kwargs)
        self.listLayers=[]
        for i in range(num_residuals):
            if i == 0 and not first_block:
                self.listLayers.append(Residual(num_channels, use_1x1conv=True, strides=2))
            else:
                self.listLayers.append(Residual(num_channels))

    def call(self, X):
        for layer in self.listLayers:
            X = layer(X)
        return X

class ResNet(tf.keras.Model):
    def __init__(self,num_blocks,**kwargs):
        super(ResNet, self).__init__(**kwargs)
        self.conv=tf.keras.layers.Conv2D(64, kernel_size=7, strides=2, padding='same')
        self.bn=tf.keras.layers.BatchNormalization()
        self.relu=tf.keras.layers.Activation('relu')
        self.mp=tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same')
        self.resnet_block1=ResnetBlock(64,num_blocks[0], first_block=True)
        self.resnet_block2=ResnetBlock(128,num_blocks[1])
        self.resnet_block3=ResnetBlock(256,num_blocks[2])
        self.resnet_block4=ResnetBlock(512,num_blocks[3])
        self.gap=tf.keras.layers.GlobalAvgPool2D()
        self.fc=tf.keras.layers.Dense(units=10,activation=tf.keras.activations.softmax)

    def call(self, x):
        x=self.conv(x)
        x=self.bn(x)
        x=self.relu(x)
        x=self.mp(x)
        x=self.resnet_block1(x)
        x=self.resnet_block2(x)
        x=self.resnet_block3(x)
        x=self.resnet_block4(x)
        x=self.gap(x)
        x=self.fc(x)
        return x

net = ResNet([2,2,2,2])
#train the model with random horizontal-flip augmentation
print(type(x))
x = np.array([tf.image.random_flip_left_right(i) for i in x])
print(type(x))
#scale pixel values to [0,1]; training directly on raw 0-255 values converges poorly
x = x / 255.0
test_x = test_x / 255.0
net.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

history = net.fit(x, y,
                    batch_size=64,
                    epochs=2,
                    validation_split=0.2)
test_scores = net.evaluate(test_x, test_y, verbose=2)
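As written, the random flip is applied once to the training array up front, so every epoch sees the same flipped copies. A sketch of the more common approach, re-augmenting on the fly each epoch with a tf.data pipeline (the buffer and batch sizes here are arbitrary choices):

train_ds = (tf.data.Dataset.from_tensor_slices((x, y))
            .shuffle(1024)
            .map(lambda img, lbl: (tf.image.random_flip_left_right(img), lbl))
            .batch(64))
net.fit(train_ds, epochs=2)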

Fine-Tuning

Put simply, fine-tuning means loading a model someone else has already trained and adapting it. Suppose a pre-trained model can distinguish 20 classes and you only need to predict a few of them, on a somewhat different dataset: since the pre-trained model already has useful recognition ability, you reuse it and continue training on your own data (that is, you fine-tune it).

Fine-tuning code for hot dog recognition

import tensorflow as tf
import numpy as np
import os
import zipfile
import wget
#the hot dog dataset was scraped from the web: 1400 positive images containing hot dogs,
#and just as many negative images of other foods. 1000 images of each class are used for training, the rest for testing. In short: a hotdog class and an "everything else" class.
#we first download the zipped dataset to ../data, then extract it there, producing the folders hotdog/train and hotdog/test.
#both contain the class folders hotdog and not-hotdog, each holding the image files
def download_data():
    data = os.getcwd()+'/data'
    #download the archive only if it is not already present
    if not os.path.exists(data+'/hotdog.zip'):
        base_url = 'https://apache-mxnet.s3-accelerate.amazonaws.com/'
        wget.download(
            base_url + 'gluon/dataset/hotdog.zip',
            data)
    else:
        print("already downloaded")
    #extract the archive
    with zipfile.ZipFile(data+'/hotdog.zip', 'r') as z:
        z.extractall(os.getcwd())
download_data()

import pathlib
#paths of the extracted folders
train_dir = 'hotdog/train'
test_dir = 'hotdog/test'
#wrap as pathlib paths
train_dir = pathlib.Path(train_dir)
# train_count = len(list(train_dir.glob('*/*.jpg')))
test_dir = pathlib.Path(test_dir)
# test_count = len(list(test_dir.glob('*/*.jpg')))
#collect all class names (label folders)
CLASS_NAMES = np.array([item.name for item in train_dir.glob('*') if item.name != 'LICENSE.txt' and item.name[0] != '.'])
#two classes in total
print(len(CLASS_NAMES))
print(CLASS_NAMES)
#ImageDataGenerator is tf's image-augmentation utility; the steps below rescale both sets to [0,1] and resize every image to 224*224
image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
BATCH_SIZE = 32
IMG_HEIGHT = 224
IMG_WIDTH = 224

train_data_gen = image_generator.flow_from_directory(directory=str(train_dir),
                                                    batch_size=BATCH_SIZE,
                                                    target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                    shuffle=True,
                                                    classes = list(CLASS_NAMES))

test_data_gen = image_generator.flow_from_directory(directory=str(test_dir),
                                                    batch_size=BATCH_SIZE,
                                                    target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                    shuffle=True,
                                                    classes = list(CLASS_NAMES))
import matplotlib.pyplot as plt

def show_batch(image_batch, label_batch):
    plt.figure(figsize=(10,10))
    for n in range(15):
        ax = plt.subplot(5,5,n+1)
        plt.imshow(image_batch[n])
        plt.title(CLASS_NAMES[label_batch[n]==1][0].title())
        plt.axis('off')
    plt.show()
#train_data_gen is a generator object; use next() to pull out one batch of images
image_batch, label_batch = next(train_data_gen)
#32*224*224*3
print(image_batch.shape)
show_batch(image_batch, label_batch)
#use ResNet-50 pre-trained on the ImageNet dataset as the source model.
#passing weights='imagenet' downloads and loads the pre-trained parameters automatically.
#a network splits into two parts: the convolutional features and the classifier (the fully connected head);
#include_top=False loads only the feature extractor
ResNet50 = tf.keras.applications.resnet_v2.ResNet50V2(weights='imagenet', include_top=False, input_shape=(224,224,3))
#alternatively, keep only the architecture, without the pre-trained weights
#ResNet50 = tf.keras.applications.resnet_v2.ResNet50V2(include_top=False, input_shape=(224,224,3))
for layer in ResNet50.layers:
    layer.trainable = False
net = tf.keras.models.Sequential()
net.add(ResNet50)
net.add(tf.keras.layers.Flatten())
net.add(tf.keras.layers.Dense(2, activation='softmax'))
net.compile(optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])

history = net.fit_generator(
                    train_data_gen,
                    steps_per_epoch=10,
                    epochs=3,
                    validation_data=test_data_gen,
                    validation_steps=10
                    )
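A quick sanity check of the fine-tuned model, a sketch not taken from the book: pull one test batch and print the predicted class names.

image_batch, label_batch = next(test_data_gen)
preds = net.predict(image_batch)
print(CLASS_NAMES[np.argmax(preds, axis=1)][:8])  #predicted class name for the first 8 images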

A model whose parameters are randomly initialized converges more slowly. Because a fine-tuned model starts from better parameter values, it usually reaches higher accuracy within the same number of epochs.

Summary:

  • Transfer learning migrates knowledge learned on a source dataset to a target dataset. Fine-tuning is a common transfer learning technique.
  • The target model copies all of the source model's design and parameters except the output layer, and fine-tunes those parameters on the target dataset. The target model's output layer is trained from scratch.
  • In general, the fine-tuned parameters use a smaller learning rate, while the freshly trained output layer can use a larger one, as sketched below.
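A minimal sketch of that last point, not from the book: after training the new head with the backbone frozen, unfreeze the backbone and recompile with a much smaller learning rate (Keras has no built-in per-layer learning rates, so a small global rate is the simplest approximation):

for layer in ResNet50.layers:
    layer.trainable = True   #unfreeze the pretrained backbone
net.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),   #much smaller than the default 1e-3
            loss='categorical_crossentropy',
            metrics=['accuracy'])
net.fit_generator(train_data_gen, steps_per_epoch=10, epochs=1)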

Object Detection and Bounding Boxes

Bounding Boxes

Code (the boxes here are placed by hand; they are not detected automatically):

import tensorflow as tf
import os
print(tf.__version__)
import matplotlib.pyplot as plt

img = plt.imread(os.getcwd()+'/img/girl2.jpg')
plt.imshow(img)
# bbox is short for bounding box
dog_bbox, cat_bbox = [20, 235, 300, 790], [520, 100, 750, 790]
#drawing helper
def bbox_to_rect(bbox, color):
    # convert a bounding box from (upper-left x, upper-left y, lower-right x, lower-right y) format
    # to the matplotlib format: ((upper-left x, upper-left y), width, height)
    return plt.Rectangle(
        xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],
        fill=False, edgecolor=color, linewidth=2)
#draw the boxes
fig = plt.imshow(img)
fig.axes.add_patch(bbox_to_rect(dog_bbox, 'blue'))
fig.axes.add_patch(bbox_to_rect(cat_bbox, 'red'))
plt.show()

Anchor Boxes

Put simply: centered on a given pixel, we can generate many bounding boxes with different sizes and aspect ratios; these generated boxes are called anchor boxes.
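For example, with n = 3 sizes and m = 3 aspect ratios, as in the code below, each pixel is given n + m - 1 = 5 anchors (all size-ratio pairs would give 9, but only the combinations involving sizes[0] or ratios[0] are kept), so an image of height h and width w yields wh(n+m-1) anchors in total.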

The code is as follows:

import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import math
import os
img_raw = tf.io.read_file(os.getcwd()+'/img/girl2.jpg')
print(type(img_raw))
#decode the JPEG into a numpy array
img = tf.image.decode_jpeg(img_raw).numpy()
print(type(img))
h, w = img.shape[0:2]
print(h, w)
#define the function that generates all anchor boxes; over the whole input image it generates wh(n+m-1) anchors
def MultiBoxPrior(feature_map, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5]):
    """
    # Implemented following d2l section "9.4.1. Generating multiple anchor boxes"; each anchor is (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        feature_map: tensor, Shape: [N, C, H, W].
        sizes: List of sizes (0~1) of generated MultiBoxPriores.
        ratios: List of aspect ratios (non-negative) of generated MultiBoxPriores.
    Returns:
        anchors of shape (1, num_anchors, 4). Every image in the batch shares the same anchors, so the first dim is 1.
    """
    pairs = [] # pair of (size, sqrt(ratio))
    for r in ratios:
        pairs.append([sizes[0], np.sqrt(r)])
    for s in sizes[1:]:
        pairs.append([s, np.sqrt(ratios[0])])

    pairs = np.array(pairs)

    ss1 = pairs[:, 0] * pairs[:, 1] # size * sqrt(ratio)
    ss2 = pairs[:, 0] / pairs[:, 1] # size / sqrt(ratio)

    base_anchors = tf.stack([-ss1, -ss2, ss1, ss2], axis=1) / 2

    h, w = feature_map.shape[-2:]
    shifts_x = tf.divide(tf.range(0, w), w)
    shifts_y = tf.divide(tf.range(0, h), h)
    shift_x, shift_y = tf.meshgrid(shifts_x, shifts_y)
    shift_x = tf.reshape(shift_x, (-1,))
    shift_y = tf.reshape(shift_y, (-1,))
    shifts = tf.stack((shift_x, shift_y, shift_x, shift_y), axis=1)

    anchors = tf.add(tf.reshape(shifts, (-1,1,4)), tf.reshape(base_anchors, (1,-1,4)))
    return tf.cast(tf.reshape(anchors, (1,-1,4)), tf.float32)

x = tf.zeros((1,3,h,w))
y = MultiBoxPrior(x)
print(y.shape)
#3064000 = 800*766*5, where 5 = len(sizes)+len(ratios)-1
#access the first anchor box centered on (250, 250).
#it has 4 elements: the x and y coordinates of the upper-left corner followed by those of the lower-right corner; each is divided by the image width or height, so all values lie between 0 and 1
boxes = tf.reshape(y, (h,w,5,4))
print(boxes[250,250,0,:])

#helper functions for drawing all anchor boxes centered on a pixel
def bbox_to_rect(bbox, color):
    # convert a bounding box from (upper-left x, upper-left y, lower-right x, lower-right y) format
    # to the matplotlib format: ((upper-left x, upper-left y), width, height)
    return plt.Rectangle(
        xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],
        fill=False, edgecolor=color, linewidth=2)
def show_bboxes(axes, bboxes, labels=None, colors=None):
    def _make_list(obj, default_values=None):
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = bbox_to_rect(bbox.numpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[i],
                va='center', ha='center', fontsize=6,
                color=text_color, bbox=dict(facecolor=color, lw=0))
    plt.show()
from IPython import display
def use_svg_display():
    """Use svg format to display plot in jupyter"""
    display.set_matplotlib_formats('svg')

use_svg_display()
# set the figure size
plt.rcParams['figure.figsize'] = (3.5, 2.5)


fig = plt.imshow(img)
bbox_scale = tf.constant([[w,h,w,h]], dtype=tf.float32)
show_bboxes(fig.axes, tf.multiply(boxes[200,250,:,:], bbox_scale),
    ['s=0.75, r=1', 's=0.75, r=2', 's=0.75, r=0.5',
     's=0.5, r=1', 's=0.25, r=1'])

Intersection over Union (IoU)
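IoU, also called the Jaccard index, measures how similar two boxes A and B are: J(A, B) = |A ∩ B| / |A ∪ B|, the area of their intersection divided by the area of their union. It ranges from 0 (no overlap) to 1 (identical boxes); compute_jaccard in the code below implements exactly this.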

Labeling Anchor Boxes in the Training Set

(Figures omitted: the ground-truth boxes; the generated anchor boxes; the boxes remaining after suppression.)

Rather than walking through the whole procedure in detail, study the code below (honestly, I don't fully understand every step either; I mostly know how to use it).

import tensorflow as tf
import matplotlib.pyplot as plt
import os
import numpy as np
#helper functions for drawing all anchor boxes centered on a pixel
def bbox_to_rect(bbox, color):
    # convert a bounding box from (upper-left x, upper-left y, lower-right x, lower-right y) format
    # to the matplotlib format: ((upper-left x, upper-left y), width, height)
    return plt.Rectangle(
        xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],
        fill=False, edgecolor=color, linewidth=2)
def show_bboxes(axes, bboxes, labels=None, colors=None):
    def _make_list(obj, default_values=None):
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = bbox_to_rect(bbox.numpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[i],
                va='center', ha='center', fontsize=6,
                color=text_color, bbox=dict(facecolor=color, lw=0))
    plt.show()

#a quick broadcasting experiment on two toy box sets
set_1 = [[1,2,3,4],[5,6,7,8]]
set_2 = [[1,1,1,1],[2,2,2,2]]
lower_bounds = tf.maximum(tf.expand_dims(set_1, axis=1), tf.expand_dims(set_2, axis=0)) # (n1, n2, 4)
upper_bounds = tf.minimum(tf.expand_dims(set_1, axis=1), tf.expand_dims(set_2, axis=0)) # (n1, n2, 4)

print(tf.expand_dims(set_1, axis=1), tf.expand_dims(set_2, axis=0), lower_bounds, tf.multiply(set_1, set_2), tf.subtract(set_1, set_2))
#intersection over union (IoU)
# 參考https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/utils.py#L356
def compute_intersection(set_1, set_2):
    """
    Compute the intersection between every pair of anchors
    Args:
        set_1: a tensor of dimensions (n1, 4), each anchor as (xmin, ymin, xmax, ymax)
        set_2: a tensor of dimensions (n2, 4), each anchor as (xmin, ymin, xmax, ymax)
    Returns:
        intersection of each of the boxes in set 1 with respect to each of the boxes in set 2, shape: (n1, n2)
    """
    # tensorflow auto-broadcasts singleton dimensions
    lower_bounds = tf.maximum(tf.expand_dims(set_1[:,:2], axis=1), tf.expand_dims(set_2[:,:2], axis=0)) # (n1, n2, 2)
    upper_bounds = tf.minimum(tf.expand_dims(set_1[:,2:], axis=1), tf.expand_dims(set_2[:,2:], axis=0)) # (n1, n2, 2)
    # clamp negative widths/heights to zero (non-overlapping boxes); the max of 3 is arbitrary, since normalized coordinates never exceed 1
    intersection_dims = tf.clip_by_value(upper_bounds - lower_bounds, clip_value_min=0, clip_value_max=3) # (n1, n2, 2)
    return tf.multiply(intersection_dims[:, :, 0], intersection_dims[:, :, 1]) # (n1, n2)

def compute_jaccard(set_1, set_2):
    """
    Compute the Jaccard index (IoU) between every pair of anchors
    Args:
        set_1: a tensor of dimensions (n1, 4), each anchor as (xmin, ymin, xmax, ymax)
        set_2: a tensor of dimensions (n2, 4), each anchor as (xmin, ymin, xmax, ymax)
    Returns:
        Jaccard Overlap of each of the boxes in set 1 with respect to each of the boxes in set 2, shape: (n1, n2)
    """
    # Find intersections
    intersection = compute_intersection(set_1, set_2)

    # Find areas of each box in both sets
    areas_set_1 = tf.multiply(tf.subtract(set_1[:, 2], set_1[:, 0]), tf.subtract(set_1[:, 3], set_1[:, 1]))  # (n1)
    areas_set_2 = tf.multiply(tf.subtract(set_2[:, 2], set_2[:, 0]), tf.subtract(set_2[:, 3], set_2[:, 1]))  # (n2)

    # Find the union
    union = tf.add(tf.expand_dims(areas_set_1, axis=1), tf.expand_dims(areas_set_2, axis=0))  # (n1, n2)
    union = tf.subtract(union, intersection)  # (n1, n2)

    return tf.divide(intersection, union) #(n1, n2)
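# A quick worked check (a sketch, not from the book): two unit squares offset by
# half a width overlap in area 0.5 with union 1.5, so their IoU is 1/3.
print(compute_jaccard(tf.constant([[0., 0., 1., 1.]]),
                      tf.constant([[0.5, 0., 1.5, 1.]])))  # ~[[0.3333]]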
def MultiBoxPrior(feature_map, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5]):
    """
    # Implemented following d2l section "9.4.1. Generating multiple anchor boxes"; each anchor is (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        feature_map: tensor, Shape: [N, C, H, W].
        sizes: List of sizes (0~1) of generated MultiBoxPriores.
        ratios: List of aspect ratios (non-negative) of generated MultiBoxPriores.
    Returns:
        anchors of shape (1, num_anchors, 4). Every image in the batch shares the same anchors, so the first dim is 1.
    """
    pairs = [] # pair of (size, sqrt(ratio))
    for r in ratios:
        pairs.append([sizes[0], np.sqrt(r)])
    for s in sizes[1:]:
        pairs.append([s, np.sqrt(ratios[0])])

    pairs = np.array(pairs)

    ss1 = pairs[:, 0] * pairs[:, 1] # size * sqrt(ratio)
    ss2 = pairs[:, 0] / pairs[:, 1] # size / sqrt(ratio)

    base_anchors = tf.stack([-ss1, -ss2, ss1, ss2], axis=1) / 2

    h, w = feature_map.shape[-2:]
    shifts_x = tf.divide(tf.range(0, w), w)
    shifts_y = tf.divide(tf.range(0, h), h)
    shift_x, shift_y = tf.meshgrid(shifts_x, shifts_y)
    shift_x = tf.reshape(shift_x, (-1,))
    shift_y = tf.reshape(shift_y, (-1,))
    shifts = tf.stack((shift_x, shift_y, shift_x, shift_y), axis=1)

    anchors = tf.add(tf.reshape(shifts, (-1,1,4)), tf.reshape(base_anchors, (1,-1,4)))
    return tf.cast(tf.reshape(anchors, (1,-1,4)), tf.float32)

img_raw = tf.io.read_file(os.getcwd()+'/img/girl2.jpg')
img = tf.image.decode_jpeg(img_raw).numpy()
h, w = img.shape[0:2]
x = tf.zeros((1,3,h,w))
y = MultiBoxPrior(x)
print(y.shape)
boxes = tf.reshape(y, (h,w,5,4))
print(tf.expand_dims(boxes[200,250,:,:][:, :2], axis=1), tf.expand_dims(boxes[210,260,1:2,:][:, :2], axis=0)
)
print(tf.maximum(tf.expand_dims(boxes[200,250,:,:][:, :2], axis=1), tf.expand_dims(boxes[210,260,1:2,:][:, :2], axis=0))
)
#use IoU to measure the similarity between anchors and ground-truth boxes, and between anchors themselves
bbox_scale = tf.constant([[w,h,w,h]], dtype=tf.float32)
#ground-truth boxes: [class, xmin, ymin, xmax, ymax]
ground_truth = tf.constant([[0, 0, 0.3, 0.39, 0.99],
                [1, 0.57, 0.15, 0.99, 1]])
#the generated anchor boxes
anchors = tf.constant([[0, 0.1, 0.2, 0.3],
            [0.15, 0.2, 0.4, 0.4],
            [0.63, 0.05, 0.88, 0.98],
            [0.66, 0.45, 0.8, 0.8],
            [0.57, 0.3,  0.92, 0.9]])

fig = plt.imshow(img)
show_bboxes(fig.axes, tf.multiply(ground_truth[:, 1:], bbox_scale),
        ['girl', 'boy'], 'k')
show_bboxes(fig.axes, tf.multiply(anchors, bbox_scale),
        ['0', '1', '2', '3', '4'])


def assign_anchor(bb, anchor, jaccard_threshold=0.5):
    """
    # Assign a ground-truth bounding box to each anchor, as in fig 9.3 of the d2l anchor chapter; anchors are normalized (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        bb: ground-truth bounding boxes, shape: (nb, 4)
        anchor: the anchors to assign, shape: (na, 4)
        jaccard_threshold: the pre-set IoU threshold
    Returns:
        assigned_idx: shape (na, ), the index of the ground-truth bb assigned to each anchor, or -1 if none is assigned
    """
    na = anchor.shape[0]
    nb = bb.shape[0]
    jaccard = compute_jaccard(anchor, bb).numpy()   # shape: (na, nb)
    assigned_idx = np.ones(na) * -1 # initialize all to -1 (unassigned)

    # first give every ground-truth bb one anchor (the jaccard_threshold is not required here)
    jaccard_cp = jaccard.copy()
    for j in range(nb):
        i = np.argmax(jaccard_cp[:, j])
        assigned_idx[i] = j
        jaccard_cp[i, :] = float("-inf")    # set to -inf, effectively removing this anchor's row

    # then handle the still-unassigned anchors, which must meet jaccard_threshold
    for i in range(na):
        if assigned_idx[i] == -1:
            j = np.argmax(jaccard[i, :])
            if jaccard[i, j] >= jaccard_threshold:
                assigned_idx[i] = j
    return tf.cast(assigned_idx, tf.int32)

def xy_to_cxcy(xy):
    """
    Convert anchors from (x_min, y_min, x_max, y_max) form to (center_x, center_y, w, h) form.
    https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/utils.py
    Args:
        xy: bounding boxes in boundary coordinates, a tensor of size (n_boxes, 4)
    Returns:
        bounding boxes in center-size coordinates, a tensor of size (n_boxes, 4)
    """
    return tf.concat(((xy[:, 2:] + xy[:, :2]) / 2,  #c_x, c_y
              xy[:, 2:] - xy[:, :2]), axis=1)



def MultiBoxTarget(anchor, label):
    """
    # Implemented following the d2l anchor chapter; anchors are normalized (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        anchor: tensor, input anchors, usually generated by MultiBoxPrior, shape: (1, total number of anchors, 4)
        label: ground-truth labels, shape: (bn, max number of ground-truth boxes per image, 5)
               if an image has fewer boxes, the blanks can be padded with -1; the last dim holds [class label, four coordinates]
    Returns:
        a list [bbox_offset, bbox_mask, cls_labels]
        bbox_offset: labeled offsets for every anchor, shape (bn, total number of anchors * 4)
        bbox_mask: same shape as bbox_offset, a mask matching the offsets one to one; negative (background) anchors get mask 0, positive anchors get mask 1
        cls_labels: labeled class for every anchor, where 0 means background, shape (bn, total number of anchors)
    """
    assert len(anchor.shape) == 3 and len(label.shape) == 3
    bn = label.shape[0]

    def MultiBoxTarget_one(anchor, label, eps=1e-6):
        """
        Helper for MultiBoxTarget that handles one element of the batch
        Args:
            anchor: shape of (total number of anchors, 4)
            label: shape of (number of ground-truth boxes, 5), the 5 being [class label, four coordinates]
            eps: a tiny value to avoid log(0)
        Returns:
            offset: (total number of anchors * 4, )
            bbox_mask: (total number of anchors * 4, ), 0 for background, 1 otherwise
            cls_labels: (total number of anchors, ), 0 for background
        """
        an = anchor.shape[0]
        assigned_idx = assign_anchor(label[:, 1:], anchor) # (total number of anchors, )
        # mask deciding whether each anchor is kept (1) or discarded as background (0)
        bbox_mask = tf.repeat(tf.expand_dims(tf.cast((assigned_idx >= 0), dtype=tf.double), axis=-1), repeats=4, axis=1)

        cls_labels = np.zeros(an, dtype=int) # 0 means background
        assigned_bb = np.zeros((an, 4), dtype=float) # coordinates of the bb assigned to each anchor
        for i in range(an):
            bb_idx = assigned_idx[i]
            if bb_idx >= 0: # i.e. not background
                cls_labels[i] = label.numpy()[bb_idx, 0] + 1 # note the +1: class 0 is reserved for background
                assigned_bb[i, :] = label.numpy()[bb_idx, 1:]

        center_anchor = tf.cast(xy_to_cxcy(anchor), dtype=tf.double)  # (center_x, center_y, w, h)
        center_assigned_bb = tf.cast(xy_to_cxcy(assigned_bb), dtype=tf.double) # (center_x, center_y, w, h)

        offset_xy = 10.0 * (center_assigned_bb[:,:2] - center_anchor[:,:2]) / center_anchor[:,2:]
        offset_wh = 5.0 * tf.math.log(eps + center_assigned_bb[:, 2:] / center_anchor[:, 2:])
        offset = tf.multiply(tf.concat((offset_xy, offset_wh), axis=1), bbox_mask)    # (total number of anchors, 4)

        return tf.reshape(offset, (-1,)), tf.reshape(bbox_mask, (-1,)), cls_labels

    batch_offset = []
    batch_mask = []
    batch_cls_labels = []
    for b in range(bn):
        offset, bbox_mask, cls_labels = MultiBoxTarget_one(anchor[0, :, :], label[b,:,:])

        batch_offset.append(offset)
        batch_mask.append(bbox_mask)
        batch_cls_labels.append(cls_labels)

    batch_offset = tf.convert_to_tensor(batch_offset)
    batch_mask = tf.convert_to_tensor(batch_mask)
    batch_cls_labels = tf.convert_to_tensor(batch_cls_labels)

    return [batch_offset, batch_mask, batch_cls_labels]
labels = MultiBoxTarget(tf.expand_dims(anchors, axis=0),tf.expand_dims(ground_truth, axis=0))
print(labels[2],labels[1],labels[0])
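# The offsets printed above follow the d2l parameterization: for an anchor with
# center and size (xa, ya, wa, ha) assigned to a ground-truth box (xb, yb, wb, hb),
#   offset_xy = 10 * ((xb - xa) / wa, (yb - ya) / ha)
#   offset_wh = 5 * (log(wb / wa), log(hb / ha))
# and bbox_mask zeroes the offsets of background anchors.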


anchors = tf.convert_to_tensor([[0.1, 0.28, 0.38, 0.99],
                [0.08, 0.2, 0.56, 0.95],
                [0.15, 0.3, 0.62, 0.91],
                [0.7, 0.2, 0.98, 0.96]])
offset_preds = tf.convert_to_tensor([0.0] * (4 * len(anchors)))
cls_probs = tf.convert_to_tensor([[0., 0., 0., 0.], # predicted probability of background
                [0.9, 0.6, 0.4, 0.1],    # predicted probability of "girl"
                [0.1, 0.2, 0.3, 0.9]])   # predicted probability of "boy"
print(anchors, offset_preds, cls_probs)
fig = plt.imshow(img)
show_bboxes(fig.axes, anchors * bbox_scale,
        ['girl=0.9', 'girl=0.6', 'girl=0.4', 'boy=0.9'])
#non-maximum suppression (NMS)
from collections import namedtuple
Pred_BB_Info = namedtuple("Pred_BB_Info",
        ["index", "class_id", "confidence", "xyxy"])

def non_max_suppression(bb_info_list, nms_threshold=0.5):
    """
    Apply non-maximum suppression to the predicted bounding boxes
    Args:
        bb_info_list: a list of Pred_BB_Info holding the predicted class, confidence and so on
        nms_threshold: the IoU threshold
    Returns:
        output: a list of Pred_BB_Info keeping only the boxes that survive the filtering
    """
    output = []
    # first sort by confidence, from high to low
    sorted_bb_info_list = sorted(bb_info_list,
                    key = lambda x: x.confidence,
                    reverse=True)
    while len(sorted_bb_info_list) != 0:
        best = sorted_bb_info_list.pop(0)
        output.append(best)

        if len(sorted_bb_info_list) == 0:
            break
        bb_xyxy = []
        for bb in sorted_bb_info_list:
            bb_xyxy.append(bb.xyxy)

        iou = compute_jaccard(tf.convert_to_tensor(best.xyxy),
                    tf.squeeze(tf.convert_to_tensor(bb_xyxy), axis=1))[0] # shape: (len(sorted_bb_info_list), )
        n = len(sorted_bb_info_list)
        sorted_bb_info_list = [
                    sorted_bb_info_list[i] for i in
                    range(n) if iou[i] <= nms_threshold]
    return output
def MultiBoxDetection(cls_prob, loc_pred, anchor, nms_threshold=0.5):
    """
    # Implemented following the d2l anchor chapter; anchors are normalized (xmin, ymin, xmax, ymax).
    https://zh.d2l.ai/chapter_computer-vision/anchor.html
    Args:
        cls_prob: per-anchor class probabilities after softmax, shape: (bn, number of classes + 1, number of anchors)
        loc_pred: predicted offsets for every anchor, shape: (bn, number of anchors * 4)
        anchor: default anchors produced by MultiBoxPrior, shape: (1, number of anchors, 4)
        nms_threshold: the IoU threshold used in non-maximum suppression
    Returns:
        information on all anchor boxes, shape: (bn, number of anchors, 6)
        each anchor's entry is [class_id, confidence, xmin, ymin, xmax, ymax]
        class_id = -1 means background or removed by non-maximum suppression
    """
    assert len(cls_prob.shape) == 3 and len(loc_pred.shape) == 2 and len(anchor.shape) == 3
    bn = cls_prob.shape[0]

    def MultiBoxDetection_one(c_p, l_p, anc, nms_threshold=0.5):
        """
        Helper for MultiBoxDetection that handles one element of the batch
        Args:
            c_p: (number of classes + 1, number of anchors)
            l_p: (number of anchors * 4, )
            anc: (number of anchors, 4)
            nms_threshold: the IoU threshold used in non-maximum suppression
        Return:
            output: (number of anchors, 6)
        """
        pred_bb_num = c_p.shape[1]
        # apply the predicted offsets to the anchors
        anc = tf.add(anc, tf.reshape(l_p, (pred_bb_num, 4))).numpy()

        # highest predicted probability for each anchor
        confidence = tf.reduce_max(c_p, axis=0)
        # class id achieving that highest probability
        class_id = tf.argmax(c_p, axis=0)
        confidence = confidence.numpy()
        class_id = class_id.numpy()

        pred_bb_info = [Pred_BB_Info(index=i,
                    class_id=class_id[i]-1,
                    confidence=confidence[i],
                    xyxy=[anc[i]]) # xyxy is a list
                for i in range(pred_bb_num)]
        # indices of the positive (kept) boxes
        obj_bb_idx = [bb.index for bb
                in non_max_suppression(pred_bb_info,
                            nms_threshold)]
        output = []
        for bb in pred_bb_info:
            output.append(np.append([
                (bb.class_id if bb.index in obj_bb_idx
                        else -1.0),
                bb.confidence],
                bb.xyxy))

        return tf.convert_to_tensor(output) # shape: (number of anchors, 6)

    batch_output = []
    for b in range(bn):
        batch_output.append(MultiBoxDetection_one(cls_prob[b],
                        loc_pred[b], anchor[0],
                        nms_threshold))

    return tf.convert_to_tensor(batch_output)
output = MultiBoxDetection(
    tf.expand_dims(cls_probs, 0),
    tf.expand_dims(offset_preds, 0),
    tf.expand_dims(anchors, 0),
    nms_threshold=0.5)
print(output)
fig = plt.imshow(img)
list_label=[]
list_anchors=[]
for i in output[0].numpy():
    if i[0] == -1:
        continue
    if i[1]<0.5:
        continue
    label = ('girl=', 'boy=')[int(i[0])] + str(i[1])
    list_label.append(label)
    print(i,label)
    list_anchors.append((i[2:]))
    #show_bboxes(fig.axes, tf.multiply(i[2:], bbox_scale), label)

show_bboxes(fig.axes, list_anchors * bbox_scale,
        list_label)
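TensorFlow also ships a built-in NMS op; a rough equivalent is sketched below (an assumption for illustration, not from the book). tf.image.non_max_suppression expects boxes as (ymin, xmin, ymax, xmax), so the xyxy anchors are reordered first, and max_output_size=4 is an arbitrary choice.

boxes_yxyx = tf.stack([anchors[:, 1], anchors[:, 0],
                       anchors[:, 3], anchors[:, 2]], axis=1)
scores = tf.reduce_max(cls_probs[1:], axis=0)  #best non-background probability per anchor
keep = tf.image.non_max_suppression(boxes_yxyx, scores,
                                    max_output_size=4, iou_threshold=0.5)
print(keep)  #indices of the anchors kept after suppression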

Multiscale Object Detection

Set the anchor size to 0.15, with a feature map whose height and width are both 4.

Halve the feature map's height and width and use larger anchors to detect larger objects: with the anchor size set to 0.4, some anchor regions overlap one another.

Finally reduce the feature map's height and width to 1 and raise the anchor size to 0.8; the anchor center now coincides with the image center. The sketch below reproduces these three settings.
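A sketch reusing the MultiBoxPrior and show_bboxes helpers defined earlier (it assumes img, h and w are still in scope; the channel count 10 is a placeholder, since only the feature-map shape matters):

def display_anchors(fmap_w, fmap_h, s):
    fmap = tf.zeros((1, 10, fmap_h, fmap_w))  #only the shape is used
    anchors = MultiBoxPrior(fmap, sizes=s, ratios=[1, 2, 0.5])
    bbox_scale = tf.constant([[w, h, w, h]], dtype=tf.float32)
    fig = plt.imshow(img)
    show_bboxes(fig.axes, tf.multiply(anchors[0], bbox_scale))

display_anchors(fmap_w=4, fmap_h=4, s=[0.15])  #small anchors on a 4x4 grid
display_anchors(fmap_w=2, fmap_h=2, s=[0.4])   #larger anchors, some overlap
display_anchors(fmap_w=1, fmap_h=1, s=[0.8])   #a single anchor set at the image center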

Summary:

  • Anchors of different numbers and sizes can be generated at multiple scales, which lets us detect objects of different sizes at multiple scales.
  • The shape of the feature map determines the centers of the anchors uniformly sampled over the input image.
  • Information from a receptive-field region of the input image is used to predict the classes and offsets of the anchors close to that region on the input image.

Other topics: not yet updated for the TF version; to be added later.
