Sohu News Text Classification with CNN (TensorFlow, Version 1)

Note: data extraction code
Training data required by the code

I. Environment

Python 3.6
tensorflow-gpu 1.12
Windows 10
PyCharm

II. Background

**1.** Based on the paper Character-level Convolutional Networks for Text Classification, this post studies and adapts an existing implementation from GitHub (GitHub link).
**2.** No word-embedding model is used yet: the loader simply counts the 5,000 most frequent characters across all texts to build a vocabulary, maps each character of an article to its vocabulary id, and feeds the resulting id sequence to the network as the text's input vector (a toy sketch of this mapping follows).
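A toy sketch of that mapping (illustrative only; the example texts and the tiny vocabulary below are made up and are not part of the project code):

from collections import Counter

# toy corpus: two very short "articles", already split into characters
docs = [list("體育新聞"), list("財經新聞")]

# count character frequencies and keep the most common ones
# (the real code keeps the top 4999 characters plus '<PAD>')
counter = Counter(ch for doc in docs for ch in doc)
words = ['<PAD>'] + [ch for ch, _ in counter.most_common(5)]
word_to_id = {ch: i for i, ch in enumerate(words)}

# an article becomes the sequence of its characters' vocabulary ids
print([word_to_id[ch] for ch in docs[0] if ch in word_to_id])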

III. Code

**Note:** run the project from the terminal:
(1) First run python run_cnn.py train
(2) Then run python run_cnn.py test
(3) The project consists of three files: cnn_model.py (builds the CNN model), cnews_loader.py (data loading), and run_cnn.py (the entry script).
(4) Create the data, Checkpoint, and tensorboard folders in the same directory as these files; the expected layout is sketched below.
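The layout below is inferred from the paths hard-coded in run_cnn.py:

cnn_model.py
cnews_loader.py
run_cnn.py
data/           # cnews.train.txt, cnews.test.txt, cnews.val.txt (cnews.vocab.txt is generated here on the first run)
Checkpoint/     # the best model is saved here as best_validation
tensorboard/    # training summaries; clear it before each new training run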

1. cnn_model.py
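A minimal sketch of this file, reconstructed from the interface that run_cnn.py expects (the TCNNConfig fields and the TextCNN attributes it uses) and following the character-level CNN design of the referenced GitHub project; all hyperparameter values here are illustrative assumptions rather than the original settings.

# A minimal sketch of cnn_model.py, reconstructed from how run_cnn.py uses
# TCNNConfig and TextCNN. Hyperparameter values are assumptions.
import tensorflow as tf


class TCNNConfig(object):
    """CNN hyperparameters (values are illustrative assumptions)."""
    embedding_dim = 64        # character embedding size
    seq_length = 600          # padded sequence length
    num_classes = 10          # number of news categories
    num_filters = 256         # number of convolution filters
    kernel_size = 5           # convolution kernel width
    vocab_size = 5000         # overwritten with len(words) in run_cnn.py
    hidden_dim = 128          # fully connected layer size
    dropout_keep_prob = 0.5   # dropout keep probability during training
    learning_rate = 1e-3
    batch_size = 64
    num_epochs = 10
    print_per_batch = 100     # report metrics every N batches
    save_per_batch = 10       # write TensorBoard summaries every N batches


class TextCNN(object):
    """Character-level CNN text classifier (TensorFlow 1.x static graph)."""

    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32, [None, config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.cnn()

    def cnn(self):
        with tf.device('/cpu:0'):
            # character embedding lookup
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope('cnn'):
            # 1-D convolution over the embedded characters, then global max pooling over time
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            gmp = tf.reduce_max(conv, axis=1, name='gmp')

        with tf.name_scope('score'):
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.nn.relu(tf.nn.dropout(fc, self.keep_prob))
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class ids

        with tf.name_scope('optimize'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope('accuracy'):
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))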


2. cnews_loader.py

import sys
import numpy as np
import tensorflow as tf
from collections import Counter


def read_file(filename):
    # Read the data file; each line is "label\tcontent".
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as fp:
        for line in fp:
            try:
                label, content = line.strip().split('\t')
                if content:
                    # store each character of the text as a separate string
                    contents.append(list(content))
                    labels.append(label)
            except ValueError:
                pass
    return contents, labels

def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    # Build the character vocabulary from the training set and save it to disk.
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        # collect every character of every text into one flat list
        all_data.extend(content)

    counter = Counter(all_data)
    # keep the (vocab_size - 1) most frequent characters and their counts
    count_pairs = counter.most_common(vocab_size - 1)
    # unzip the (character, count) pairs and keep only the characters
    words, _ = list(zip(*count_pairs))
    # prepend <PAD>, which is used to pad all texts to the same length
    words = ['<PAD>'] + list(words)
    with open(vocab_dir, mode='w', encoding='utf-8') as fp:
        fp.write('\n'.join(words) + '\n')

def read_vocab(vocab_dir):
    # Load the vocabulary and build a character -> id mapping.
    with open(vocab_dir, mode='r', encoding='utf-8', errors='ignore') as fp:
        words = [word.strip() for word in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id

def read_category():
    # The ten news categories are fixed.
    categories = ['體育', '財經', '房產', '家居', '教育', '科技', '時尚', '時政', '遊戲', '娛樂']
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id

def to_words(content, words):
    # Convert a sequence of ids back into text.
    return ''.join(words[x] for x in content)

def process_file(filename, word_to_id, cat_to_id, max_length=600):
    # Convert a data file into padded id sequences and one-hot labels.
    contents, labels = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])

    # pad_sequences pads (or truncates) at the beginning of each sequence by default
    x_pad = tf.keras.preprocessing.sequence.pad_sequences(data_id, max_length)
    # one-hot encode the label ids
    y_pad = tf.keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad

def batch_iter(x, y, batch_size=64):
    # Yield shuffled mini-batches of (x, y).
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1

    # random permutation of the sample indices, used to shuffle x and y together
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
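A short sketch of how these loader functions fit together, mirroring what run_cnn.py does below (the paths are the ones defined there):

import os
from cnews_loader import build_vocab, read_vocab, read_category, process_file, batch_iter

train_dir = './data/cnews.train.txt'
vocab_dir = './data/cnews.vocab.txt'

if not os.path.exists(vocab_dir):
    build_vocab(train_dir, vocab_dir, vocab_size=5000)   # build the character vocabulary once

categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)

x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, 600)  # padded ids, one-hot labels
for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=64):
    pass  # each shuffled mini-batch is fed to the model during training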

3. run_cnn.py

from __future__ import print_function

import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from cnn_model import TCNNConfig,TextCNN
from cnews_loader import read_vocab,read_category,batch_iter,process_file,build_vocab

train_dir = './data/cnews.train.txt'
test_dir = './data/cnews.test.txt'
val_dir = './data/cnews.val.txt'
vocab_dir = './data/cnews.vocab.txt'

save_dir = './Checkpoint'
save_path = './Checkpoint/best_validation'

def get_time_dif(start_time):
    # Return the elapsed time since start_time as a timedelta.
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

def feed_data(x_batch,y_batch,keep_prob):
    feed_dict ={
        model.input_x:x_batch,
        model.input_y:y_batch,
        model.keep_prob:keep_prob
    }
    return feed_dict

def evaluate(sess,x,y):
    # Evaluate the average loss and accuracy on the given data.
    data_len = len(x)
    batch_eval = batch_iter(x,y,128)
    total_loss = 0.0
    total_acc = 0.0

    for x_batch,y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch,y_batch,1.0)
        loss,acc = sess.run([model.loss,model.acc],feed_dict=feed_dict)
        total_loss += loss*batch_len
        total_acc += acc*batch_len

    return total_loss/data_len,total_acc/data_len

def train():
    print('Configuring TensorBoard and Saver')
    # Clear the tensorboard folder before retraining, otherwise the new curves overwrite the old ones.
    tensorboard_dir = './tensorboard'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss",model.loss)
    tf.summary.scalar("accuracy",model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # configure the Saver for checkpointing
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print('Loading training and validation data....')
    # load the training and validation sets
    start_time = time.time()
    x_train,y_train = process_file(train_dir,word_to_id,cat_to_id,config.seq_length)
    x_val,y_val = process_file(val_dir,word_to_id,cat_to_id,config.seq_length)
    time_dif = get_time_dif(start_time)
    print('Time usage:',time_dif)

    # create the TensorFlow session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating....')
    start_time = time.time()
    total_batch = 0                 # total number of batches processed
    best_acc_val = 0.0              # best validation accuracy so far
    last_improved = 0               # batch index of the last improvement
    require_improvement = 1000      # stop early after this many batches without improvement

    flag = False
    for epoch in range(config.num_epochs):
        print('Epoch',epoch+1)
        batch_train = batch_iter(x_train,y_train,config.batch_size)
        for x_batch,y_batch in batch_train:
            feed_dict = feed_data(x_batch,y_batch,config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # write training summaries to TensorBoard every save_per_batch batches
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # report performance on the current training batch and on the validation set
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)
                if acc_val > best_acc_val:
                    # save the model whenever the validation accuracy improves
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''
                time_dif = get_time_dif(start_time)
                msg = ('Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%}, '
                       'Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}')
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # run one optimization step
            total_batch += 1
            if total_batch - last_improved > require_improvement:
                # validation accuracy has not improved for a long time: stop training early
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:  # early stop triggered inside the batch loop
            break

def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # restore the best saved model
    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))
    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predicted class ids
    for i in range(num_batch):  # predict batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
    # evaluation: per-class precision, recall and F1
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))
    # confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")
    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary file if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
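After training, the loss and accuracy curves written to ./tensorboard can be inspected by running tensorboard --logdir ./tensorboard and opening the URL it prints.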