0:生成 LMDB 的流程
- 已有的 json 數據集,可以通過 labelme 等開源工具標註,或者自己寫腳本生成
- 將 json 文件轉成 voc2007 格式的文件(labelme 格式轉 VOC2007 數據集格式)
- 設置個人的 labelmap.prototxt
- 借用 ssd-caffe 的 create_list.sh 腳本生成待轉換的列表文件(每行爲「圖片路徑 標註路徑」)
- 借用ssd-caffe的 create_data.sh 腳本生成 LMDB 格式文件
1:製作自己的數據集
這裏推薦幾款好用的標註工具
- labelme:安裝簡單,支持標定關鍵點,分割等,非常好用,生成 json 格式的標籤文件。
格式:下面列出的標籤參數都是必須的,否則labelme無法正常識別
{
"shapes": [
{
"shape_type": "polygon",
"line_color": null,
"points": [
[ 634, 276 ],
[ 703, 275 ],
[ 705, 312 ],
[ 635, 313 ]
],
"fill_color": null,
"label": "traffic-4"
},
{
"shape_type": "polygon",
"line_color": null,
"points": [
[ 715, 275 ],
[ 785, 274 ],
[ 786, 313 ],
[ 716, 312 ]
],
"fill_color": null,
"label": "traffic-4-occ-largely"
}
],
"lineColor": [ 0, 255, 0, 128 ],
"imagePath": "2012-3-23_20-23-25_0.jpg",
"fillColor": [ 255, 0, 0, 128 ],
"imageData": null
}
- labelImg:安裝簡單,非常方便的畫框的標定工具。
支持 PASCAL VOC 格式的 XML 標籤。
- 其他:待續……
2:JSON 轉 VOC2007
# -*- coding: utf-8 -*-
import os, re
import json
import cv2
import numpy as np
import codecs
from glob import glob
import shutil
from sklearn.model_selection import train_test_split
def iter_files(data_root_path, saved_path):
    """Convert every labelme ``.json`` file found below *data_root_path*.

    The directory tree is traversed with ``os.walk``; for each file whose
    name ends in ``.json`` a running counter and the file path are printed
    and ``json2voc2007`` is called with the file stem, the directory that
    contains the file, and *saved_path*.

    Args:
        data_root_path: Root directory that is searched recursively.
        saved_path: Output root that is forwarded to ``json2voc2007``.
    """
    count = 0
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is required (a manual recursive call on the bare directory
    # name would be resolved against the current working directory instead
    # of the directory being walked).
    for root, _dirs, files in os.walk(data_root_path):
        for json_file in files:
            # Match the real extension only; a regex such as ".json" would
            # also accept names like "xjson" or "a.json.bak".
            if not json_file.endswith(".json"):
                continue
            file_name = os.path.splitext(json_file)[0]
            file_path = os.path.join(root, json_file)
            count += 1
            print("====================================================================")
            print(count)
            print(file_path)
            # json -> voc2007
            json2voc2007(file_name, root, saved_path)
def _write_voc_annotation(xml_path, image_name, image_shape, shapes, json_filename):
    """Write one VOC2007-style XML annotation file to *xml_path*.

    Only shapes labelled ``traffic-3`` or ``traffic-3-occ-partially`` are
    exported; each is reduced to the axis-aligned bounding box of its points.
    """
    kept_labels = ("traffic-3", "traffic-3-occ-partially")
    height, width, channels = image_shape
    with codecs.open(xml_path, "w", "utf-8") as xml:
        xml.write('<annotation>\n')
        xml.write('\t<folder>' + 'TrafficSign' + '</folder>\n')
        xml.write('\t<filename>' + image_name + '</filename>\n')
        xml.write('\t<source>\n')
        xml.write('\t\t<database>The UAV autolanding</database>\n')
        xml.write('\t\t<annotation>UAV AutoLanding</annotation>\n')
        xml.write('\t\t<image>flickr</image>\n')
        xml.write('\t\t<flickrid>NULL</flickrid>\n')
        xml.write('\t</source>\n')
        xml.write('\t<owner>\n')
        xml.write('\t\t<flickrid>NULL</flickrid>\n')
        xml.write('\t\t<name>TrafficSign</name>\n')
        xml.write('\t</owner>\n')
        xml.write('\t<size>\n')
        xml.write('\t\t<width>' + str(width) + '</width>\n')
        xml.write('\t\t<height>' + str(height) + '</height>\n')
        xml.write('\t\t<depth>' + str(channels) + '</depth>\n')
        xml.write('\t</size>\n')
        xml.write('\t\t<segmented>0</segmented>\n')
        for shape in shapes:
            label = shape["label"]
            # Label filter: every other label is skipped.
            if label not in kept_labels:
                continue
            points = np.array(shape["points"])
            if points.ndim != 2 or points.shape[0] == 0:
                # A shape without points has no bounding box.
                continue
            # VOC bounding boxes are integer pixel coordinates, while the
            # JSON points may be stored as floats; round before writing.
            xmin = int(round(points[:, 0].min()))
            xmax = int(round(points[:, 0].max()))
            ymin = int(round(points[:, 1].min()))
            ymax = int(round(points[:, 1].max()))
            if xmax <= xmin or ymax <= ymin:
                # Degenerate (zero-area) box.
                continue
            xml.write('\t<object>\n')
            xml.write('\t\t<name>' + label + '</name>\n')
            xml.write('\t\t<pose>Unspecified</pose>\n')
            xml.write('\t\t<truncated>1</truncated>\n')
            xml.write('\t\t<difficult>0</difficult>\n')
            xml.write('\t\t<bndbox>\n')
            xml.write('\t\t\t<xmin>' + str(xmin) + '</xmin>\n')
            xml.write('\t\t\t<ymin>' + str(ymin) + '</ymin>\n')
            xml.write('\t\t\t<xmax>' + str(xmax) + '</xmax>\n')
            xml.write('\t\t\t<ymax>' + str(ymax) + '</ymax>\n')
            xml.write('\t\t</bndbox>\n')
            xml.write('\t</object>\n')
            print(json_filename, xmin, ymin, xmax, ymax, label)
        xml.write('</annotation>')


def _write_name_list(list_path, names):
    """Write *names* to *list_path*, one per line (an empty list truncates the file)."""
    with open(list_path, "w") as handle:
        for name in names:
            handle.write(name + "\n")


def _write_split_lists(saved_path):
    """Regenerate trainval/train/val/test lists from the XML files under *saved_path*."""
    list_dir = os.path.join(saved_path, "ImageSets", "Main")
    xml_paths = glob(os.path.join(saved_path, "Annotations", "*.xml"))
    # Sorted so that the fixed random_state yields the same split on every
    # run; glob itself returns files in arbitrary order.
    total_files = sorted(
        os.path.splitext(os.path.basename(path))[0] for path in xml_paths
    )
    if len(total_files) < 2:
        # train_test_split raises ValueError when one side of the split
        # would be empty, which happens right after the first conversion.
        train_files, val_files = list(total_files), []
    else:
        # test_size sets the train:val ratio.
        train_files, val_files = train_test_split(
            total_files, test_size=0.10, random_state=42
        )
    _write_name_list(os.path.join(list_dir, "trainval.txt"), total_files)
    _write_name_list(os.path.join(list_dir, "train.txt"), train_files)
    _write_name_list(os.path.join(list_dir, "val.txt"), val_files)
    # No test images are collected; test.txt is only created/emptied.
    _write_name_list(os.path.join(list_dir, "test.txt"), [])


def json2voc2007(json_file_, labelme_path, saved_path):
    """Convert one labelme annotation into the VOC2007 layout under *saved_path*.

    Steps performed:
      1. read ``<labelme_path>/<json_file_>.json`` and the matching ``.jpg``;
      2. write ``Annotations/<json_file_>.xml``;
      3. copy the image to ``JPEGImages/<json_file_>.jpg``;
      4. rewrite the ``ImageSets/Main/*.txt`` lists from all XML files
         currently present in ``Annotations``.

    The three output directories must already exist.

    Args:
        json_file_: File stem shared by the ``.json`` and ``.jpg`` files.
        labelme_path: Directory containing both input files.
        saved_path: Root of the VOC2007 output tree.

    Raises:
        IOError: If the image cannot be read by ``cv2.imread``.
    """
    json_filename = os.path.join(labelme_path, json_file_ + ".json")
    image_path = os.path.join(labelme_path, json_file_ + ".jpg")

    with open(json_filename, "r") as handle:
        json_file = json.load(handle)

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("cannot read image: " + image_path)

    xml_path = os.path.join(saved_path, "Annotations", json_file_ + ".xml")
    _write_voc_annotation(
        xml_path, json_file_ + ".jpg", image.shape, json_file["shapes"], json_filename
    )

    # Copy the image to VOC2007/JPEGImages/.
    print("copy image files to VOC2007/JPEGImages/")
    shutil.copyfile(
        image_path, os.path.join(saved_path, "JPEGImages", json_file_ + ".jpg")
    )

    # NOTE: the lists are rebuilt after every converted file so they always
    # reflect the current content of Annotations/.
    _write_split_lists(saved_path)
def main():
    """Create the VOC2007 output folders and convert everything under ./data/."""
    # Output root of the generated dataset.
    saved_path = "./VOC2007/"

    # Make sure the folders the converter writes into exist.
    for sub_dir in ("Annotations", "JPEGImages/", "ImageSets/Main/"):
        target_dir = saved_path + sub_dir
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    # Walk the input tree and convert every annotation found.
    data_root_path = "./data/"
    iter_files(data_root_path, saved_path)


if __name__ == '__main__':
    main()
3:labelmap.prototxt 設定(以背景,目標兩類爲例)
# Label map with two entries: label 0 (shown as "background") and
# label 1 (shown as "face").
# NOTE(review): presumably each `name` has to match the <name> values written
# into the VOC XML annotations - confirm against the labels actually exported.
item {
name: "none_of_the_above"
label: 0
display_name: "background"
}
# Single foreground class of this example.
item {
name: "face"
label: 1
display_name: "face"
}
4:create_list.sh
這裏主要注意,root_dir 要修改爲自己 VOC2007 的路徑
#!/bin/bash
# Build "<image path> <annotation path>" list files from the VOC2007
# ImageSets/Main/<subset>.txt files.
#
# Usage: create_list.sh [subset ...]
#   With no arguments only "val" is processed. Pass e.g. "train val" to
#   generate several lists in one run.
#
# Output, written next to this script:
#   <subset>.txt            one "/JPEGImages/x.jpg /Annotations/x.xml" per line
#   val_name_size.txt       only for the "val" subset (via get_image_size)

# Change root_dir to the location of your own VOC2007 directory.
root_dir="$HOME/data/VOC2007"
sub_dir=ImageSets/Main
echo "$(dirname "${BASH_SOURCE[0]}")"
bash_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Default to the "val" subset when no subset is given on the command line.
if [ $# -eq 0 ]
then
  set -- val
fi

for dataset in "$@"
do
  dst_file="$bash_dir/$dataset.txt"
  if [ -f "$dst_file" ]
  then
    rm -f "$dst_file"
  fi
  echo "Create list for $dataset..."
  echo "$root_dir/$sub_dir/$dataset.txt"
  dataset_file="$root_dir/$sub_dir/$dataset.txt"

  # Turn every name into /JPEGImages/<name>.jpg
  img_file="$bash_dir/${dataset}_img.txt"
  cp "$dataset_file" "$img_file"
  sed -i "s/^/\/JPEGImages\//g" "$img_file"
  sed -i "s/$/.jpg/g" "$img_file"

  # Turn every name into /Annotations/<name>.xml
  label_file="$bash_dir/${dataset}_label.txt"
  cp "$dataset_file" "$label_file"
  sed -i "s/^/\/Annotations\//g" "$label_file"
  sed -i "s/$/.xml/g" "$label_file"

  # Join both columns line by line, then drop the temporary files.
  paste -d' ' "$img_file" "$label_file" >> "$dst_file"
  rm -f "$label_file"
  rm -f "$img_file"

  # Generate image name and size information.
  if [ "$dataset" == "val" ]
  then
    "$bash_dir/../../build/tools/get_image_size" "$root_dir" "$dst_file" "$bash_dir/${dataset}_name_size.txt"
  fi

  # Shuffle train file.
  if [ "$dataset" == "train" ]
  then
    rand_file="$dst_file.random"
    cat "$dst_file" | perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' > "$rand_file"
    mv "$rand_file" "$dst_file"
  fi
done
5:create_data.sh
# Run scripts/create_annoset.py for each subset, using the list file
# data/<dataset_name>/<subset>.txt and the label map
# data/<dataset_name>/labelmap.prototxt; the database is written under
# <data_root_dir>/<dataset_name>/<db>/.
# Requires bash (uses BASH_SOURCE).
cur_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "$cur_dir"
root_dir="$cur_dir/../.."
# Stop instead of running the conversion from an unexpected directory.
cd "$root_dir" || exit 1

# Set redo to 0 (or leave it empty) to omit the --redo flag.
redo=1
data_root_dir="$HOME/data/VOC2007"
dataset_name="DataName"
mapfile="$root_dir/data/$dataset_name/labelmap.prototxt"
anno_type="detection"
db="lmdb"
min_dim=0
max_dim=0
width=0
height=0

extra_cmd="--encode-type=jpg --encoded"
# A plain `[ $redo ]` is true for every non-empty value, including "0".
if [ -n "$redo" ] && [ "$redo" != "0" ]
then
  extra_cmd="$extra_cmd --redo"
fi

for subset in train
do
  # $extra_cmd is intentionally unquoted: it holds several options that must
  # be split into separate words.
  # NOTE(review): sudo makes the generated files root-owned - confirm that it
  # is really needed on the target machine.
  sudo python2 "$root_dir/scripts/create_annoset.py" --anno-type="$anno_type" --label-map-file="$mapfile" --min-dim="$min_dim" --max-dim="$max_dim" --resize-width="$width" --resize-height="$height" --check-label $extra_cmd "$data_root_dir" "$root_dir/data/$dataset_name/$subset.txt" "$data_root_dir/$dataset_name/$db/${dataset_name}_${subset}_${db}" "examples/$dataset_name"
done