Note: data extraction code
Training data required by the code
1. Environment
Python 3.6
tensorflow-gpu 1.12
Windows 10
PyCharm
2. Background
**1.** This post follows the paper *Character-level Convolutional Networks for Text Classification* and studies a reference implementation from GitHub (GitHub link).
**2.** The code does not yet use a pretrained word-embedding model. It simply counts character frequencies over all texts, keeps the 5,000 most frequent characters, maps every character of an article to its id in this vocabulary, and uses the resulting id sequence as the input vector for that text (see the small example below).
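As a toy illustration of this id mapping (the two example "articles" and the tiny vocabulary here are made up for illustration; the real pipeline is in cnews_loader.py below):

from collections import Counter

# two made-up "articles"; in the real data each line of the file is one article
texts = ['今天天氣很好', '今天股市大跌']

# count character frequencies over the whole corpus and keep the most frequent ones
counter = Counter(ch for text in texts for ch in text)
vocab = ['<PAD>'] + [ch for ch, _ in counter.most_common(5000 - 1)]
char_to_id = {ch: i for i, ch in enumerate(vocab)}

# an article becomes the list of ids of its characters,
# which is later padded/truncated to a fixed length (600 in this project)
ids = [char_to_id[ch] for ch in texts[0] if ch in char_to_id]
print(ids)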
3. Code
**Note:** run the scripts from a terminal:
(1) First run `python run_cnn.py train`.
(2) Then run `python run_cnn.py test`.
(3) The project consists of three files: cnn_model.py (the CNN model), cnews_loader.py (data loading), and run_cnn.py (the entry point).
(4) Create data, Checkpoint, and tensorboard folders in the same directory (e.g. `mkdir data Checkpoint tensorboard`) before training.
1. cnn_model.py
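run_cnn.py below imports TCNNConfig and TextCNN from cnn_model. A minimal sketch of such a model, following the character-level CNN of the GitHub project referenced above (embedding, one 1-D convolution with global max pooling, a fully connected layer with dropout, softmax classifier), is given here. The config field names and model attributes (input_x, input_y, keep_prob, loss, acc, optim, y_pred_cls) are the ones run_cnn.py uses; the specific hyper-parameter values are assumptions and can be tuned.

import tensorflow as tf


class TCNNConfig(object):
    # configuration for the character-level CNN; run_cnn.py reads these fields.
    # seq_length, vocab_size and batch_size match the loader defaults below;
    # the remaining values are assumed defaults, not taken from the original post.
    embedding_dim = 64        # dimension of the character embedding
    seq_length = 600          # length every text is padded/truncated to
    num_classes = 10          # number of categories (see read_category)
    num_filters = 256         # number of convolution filters
    kernel_size = 5           # width of the 1-D convolution kernel
    vocab_size = 5000         # vocabulary size (overwritten in run_cnn.py)
    hidden_dim = 128          # size of the fully connected layer
    dropout_keep_prob = 0.5   # keep probability used during training
    learning_rate = 1e-3      # Adam learning rate
    batch_size = 64           # mini-batch size
    num_epochs = 10           # number of training epochs
    print_per_batch = 100     # report metrics every N batches
    save_per_batch = 10       # write TensorBoard summaries every N batches


class TextCNN(object):
    # character-level CNN: embedding -> conv1d -> global max pooling -> dense -> softmax
    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.cnn()

    def cnn(self):
        with tf.device('/cpu:0'):
            # look up the embedding vector for every character id
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope('cnn'):
            # 1-D convolution over the character sequence, then global max pooling over time
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            gmp = tf.reduce_max(conv, axis=1, name='gmp')

        with tf.name_scope('score'):
            # fully connected layer with dropout and ReLU, then the classifier
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.nn.relu(tf.nn.dropout(fc, keep_prob=self.keep_prob))
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class ids

        with tf.name_scope('optimize'):
            # softmax cross-entropy loss and Adam optimizer
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope('accuracy'):
            # fraction of examples whose predicted class matches the label
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))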
2. cnews_loader.py
import sys
import numpy as np
import tensorflow as tf
from collections import Counter


def read_file(filename):
    # read a data file and split every text into individual characters
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as fp:
        for line in fp:
            try:
                label, content = line.strip().split('\t')
                if content:
                    # each character of the text becomes one token
                    contents.append(list(content))
                    labels.append(label)
            except:
                # skip malformed lines
                pass
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    # build the vocabulary from the training set and write it to disk
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        # collect the characters of every text into a single list
        all_data.extend(content)
    # count how often each character occurs
    counter = Counter(all_data)
    # keep the (vocab_size - 1) most frequent characters and their counts
    count_pairs = counter.most_common(vocab_size - 1)
    # unzip the (character, count) pairs and keep only the characters
    words, _ = list(zip(*count_pairs))
    # add <PAD>, used to pad all texts to the same length
    words = ['<PAD>'] + list(words)
    with open(vocab_dir, mode='w', encoding='utf-8') as fp:
        fp.write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    # read the vocabulary and build the character-to-id mapping
    with open(vocab_dir, mode='r', encoding='utf-8', errors='ignore') as fp:
        words = [word.strip() for word in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def read_category():
    # fixed list of categories; the strings must match the labels used in the data files
    categories = ['體育', '財經', '房產', '家居', '教育', '科技', '時尚', '時政', '遊戲', '娛樂']
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id


def to_words(content, words):
    # convert a sequence of ids back into text
    return ''.join(words[x] for x in content)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    # convert a data file into padded id sequences and one-hot labels
    contents, labels = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    # pad with zeros at the front (the default) or truncate to max_length
    x_pad = tf.keras.preprocessing.sequence.pad_sequences(data_id, max_length)
    # one-hot encode the labels
    y_pad = tf.keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    # yield shuffled mini-batches of (x, y)
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    # random permutation of the indices, used to shuffle x and y together
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
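For a quick standalone check of the loader, a minimal sketch (reusing the ./data paths that run_cnn.py defines below):

import os
from cnews_loader import build_vocab, read_vocab, read_category, process_file, batch_iter

train_dir = './data/cnews.train.txt'
vocab_dir = './data/cnews.vocab.txt'

if not os.path.exists(vocab_dir):
    build_vocab(train_dir, vocab_dir, 5000)   # build the character vocabulary once
words, word_to_id = read_vocab(vocab_dir)
categories, cat_to_id = read_category()

x, y = process_file(train_dir, word_to_id, cat_to_id, 600)
print(x.shape, y.shape)   # (num_examples, 600) id matrix, (num_examples, 10) one-hot labels

for x_batch, y_batch in batch_iter(x, y, batch_size=64):
    break   # each iteration yields one shuffled mini-batch of at most 64 examples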
3. run_cnn.py
from __future__ import print_function

import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from cnn_model import TCNNConfig, TextCNN
from cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab

train_dir = './data/cnews.train.txt'
test_dir = './data/cnews.test.txt'
val_dir = './data/cnews.val.txt'
vocab_dir = './data/cnews.vocab.txt'

save_dir = './Checkpoint'
save_path = './Checkpoint/best_validation'


def get_time_dif(start_time):
    # return the elapsed time since start_time
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }
    return feed_dict


def evaluate(sess, x, y):
    # evaluate the loss and accuracy on a whole dataset
    data_len = len(x)
    batch_eval = batch_iter(x, y, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len


def train():
    print('Configuring TensorBoard and Saver')
    # empty the tensorboard folder before each run, otherwise old summaries get mixed in
    tensorboard_dir = './tensorboard'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # configure the Saver
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print('Loading training and validation data...')
    # load the training and validation sets
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)

    # create the session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0              # total number of batches processed
    best_acc_val = 0.0           # best validation accuracy so far
    last_improved = 0            # batch index of the last improvement
    require_improvement = 1000   # stop early if no improvement for 1000 batches

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # every save_per_batch batches, write the training summaries to TensorBoard
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # every print_per_batch batches, report performance on the training and validation sets
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)

                if acc_val > best_acc_val:
                    # save the best model so far
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # run the optimizer
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # validation accuracy has not improved for a long time: stop early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # leave the inner loop
        if flag:  # leave the outer loop as well
            break


def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predicted classes
    for i in range(num_batch):  # predict batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # evaluation report
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist yet
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()