A FastText Text-Classification Example with Torchtext

Dataset download link: https://pan.baidu.com/s/17IhfiYTfo0UeLoir1h3kjA

Extraction code: px2z

The script below builds torchtext Datasets from CSV files, trains a simple FastText classifier in PyTorch, and evaluates it on the test set.

import torch
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm
from torchtext import data
from torchtext.vocab import Vectors
from torchtext.data import Iterator,BucketIterator,TabularDataset

import os
import random
import logging
import numpy as np
import pandas as pd


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Returns the Dataset needed to build the iterators.
# The data format is as follows: each row has three columns; the first is the
# label, the second is the title, and the third is the content:
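# A hypothetical illustration of the CSV layout this loader expects (the column
# names 'label', 'title', 'text' and the sample rows below are assumptions, not
# taken from the actual dataset; labels run 1-4, as noted in train_model):
#   label,title,text
#   3,"Some headline","body text of the article ..."
#   1,"Another headline","body text of another article ..."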
class MyDataset(data.Dataset):
    def __init__(self, path, text_field, label_field, test=False, aug=False, **kwargs):
        fields = [("label",label_field), ("title",None), ("text",text_field)]
        examples = []
        csv_data = pd.read_csv(path)
        print('read data from {}'.format(path))
        if test:
            # For the test set, the label is still loaded so that test-set
            # accuracy can be checked; if no labels exist, set the label
            # argument to None as well.
            for label,text in tqdm(zip(csv_data['label'], csv_data['text'])):
                examples.append(data.Example.fromlist([label,None,text],fields))
                #print(vars(examples[len(examples)-1]))
        else:
            for label,text in tqdm(zip(csv_data['label'], csv_data['text'])):
                if aug:
                    rate = random.random()
                    if rate > 0.5:
                        text = self.dropout(text)
                    else:
                        text = self.shuffle(text)
                # Example: defines a single training or test example; stores
                # each column of the example as an attribute.
                examples.append(data.Example.fromlist([label,None,text], fields))
        # The lines above were preprocessing; here super() invokes the parent
        # constructor to produce a standard Dataset.
        # super(MyDataset, self).__init__(examples, fields, **kwargs)
        super(MyDataset, self).__init__(examples, fields)

    def shuffle(self, text):
        # Shuffle the order of the tokens in the text.
        text = np.random.permutation(text.strip().split())
        return ' '.join(text)

    def dropout(self, text, p=0.5):
        # Randomly blank out some tokens. Note that np.random.choice samples
        # with replacement here, so slightly fewer than a fraction p of the
        # tokens may actually be dropped.
        text = text.strip().split()
        len_ = len(text)
        indices = np.random.choice(len_, int(len_ * p))
        for i in indices:
            text[i] = ''
        return ' '.join(text)
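# A minimal sketch (not part of the original script) of what the two
# augmentations do to a sample sentence; both are random, so outputs vary:
#
#   ds_aug = MyDataset.__new__(MyDataset)      # skip __init__ just for the demo
#   ds_aug.shuffle('the quick brown fox')      # e.g. 'fox the brown quick'
#   ds_aug.dropout('the quick brown fox')      # e.g. 'the  brown fox'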
class Fasttext(nn.Module):
    '''
    vocab: vocabulary built from the training set
    vec_dim: dimensionality of the word vectors
    label_size: number of classes
    hidden_size: number of neurons in the hidden layer
    '''
    def __init__(self, vocab, vec_dim, label_size, hidden_size):
        super(Fasttext, self).__init__()
        # Create the word-embedding layer.
        self.embedding = nn.Embedding(len(vocab), vec_dim)
        # When using pretrained vectors, the pretrained weights are copied in here.
        self.embedding.weight.data.copy_(vocab.vectors)
        # Allow the embeddings to be fine-tuned during training.
        self.embedding.weight.requires_grad = True
        self.fc = nn.Sequential(
            nn.Linear(vec_dim, hidden_size),
            # hidden_size is the expected number of features, i.e. the output
            # dimension of the previous layer.
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, label_size)
        )

    def forward(self, text):
        # text: (batch_size, seq_len) token ids -> (batch_size, seq_len, vec_dim)
        text = self.embedding(text)
        # FastText-style averaging: mean-pool the word vectors over the
        # sequence dimension, then classify with the feed-forward head.
        result = self.fc(torch.mean(text, dim=1))
        return result
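# A quick shape sanity check (illustrative only; the sizes below are the ones
# used in __main__): a batch of 8 sequences of 50 token ids should map to
# 8 rows of 4 class scores.
#
#   m = Fasttext(vocab, vec_dim=300, label_size=4, hidden_size=200)
#   dummy = torch.randint(0, len(vocab), (8, 50))
#   assert m(dummy).shape == (8, 4)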

def get_data_iter(train_csv, test_csv, fix_length):
    # Note: word2vec_dir and batch_size are module-level globals defined in
    # __main__ below.
    Text = data.Field(sequential=True, lower=True, fix_length=fix_length, batch_first=True)
    Label = data.Field(sequential=False, use_vocab=False)

    # train_fields = [('label', Label), ('title', None), ('text', Text)]
    # train = TabularDataset(path=train_csv, format='csv', fields=train_fields, skip_header=True)
    # TabularDataset kept going out of range, hence the custom Dataset.
    # Datasets after preprocessing:
    train = MyDataset(path=train_csv, text_field=Text, label_field=Label, test=False, aug=True)
    # aug has no effect when test=True (see MyDataset above).
    test = MyDataset(path=test_csv, text_field=Text, label_field=Label, test=True, aug=True)

    # Build the vocabulary from the training set and attach the pretrained vectors.
    vectors = Vectors(name=word2vec_dir)
    Text.build_vocab(train, vectors=vectors)
    vocab = Text.vocab

    #test_fields = [('label',Label),('title',None),('text',Text)]
    #test = TabularDataset(path=test_csv,format='csv',fields=test_fields,skip_header=True)
    train_iter,test_iter = BucketIterator.splits(
        (train,test),
        batch_size=batch_size,
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        repeat=False)
    #train_iter = BucketIterator(train, batch_size=batch_size, device=device, sort_key=lambda x: len(x.text),sort_within_batch=False, repeat=False)
    #test_iter = Iterator(test,batch_size=batch_size,device = device,sort = False,sort_within_batch = False,repeat=False)
    #print((next(iter(train_iter)).text))
    return train_iter,test_iter,vocab
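# Each batch yielded by the iterators above is a torchtext Batch whose .text is
# a LongTensor of shape (batch_size, fix_length) (because batch_first=True) and
# whose .label holds the raw 1-based labels, e.g.:
#
#   batch = next(iter(train_iter))
#   batch.text.shape    # torch.Size([64, 50]) with the settings in __main__
#   batch.label.shape   # torch.Size([64])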


def train_model(model, train_iter, epoch, lr, batch_size):
    # Put the model into training mode.
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    model.to(device)
    criterion.to(device)
    for i in range(epoch):
        # Loop over the batches.
        for batch_idx, batch in enumerate(train_iter):
            # Note target = batch.label - 1: the labels in the dataset are
            # 1, 2, 3, 4, but PyTorch expects labels starting from 0.
            data, target = batch.text, batch.label - 1
            optimizer.zero_grad()   # clear the gradients of all optimized tensors
            output = model(data)    # forward pass on the batch
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # Print progress. CrossEntropyLoss already averages over the batch,
            # so loss.item() is reported directly rather than divided by batch_size.
            if batch_idx % 500 == 0:
                print('train_epoch:{},batch_id:{},loss:{}'.format(i, batch_idx, loss.item()))
    print('Finished Training')

def model_test(model, test_iter):
    # Put the model into evaluation mode.
    model.eval()
    model.to(device)
    correct = 0
    total = 0
    with torch.no_grad():
        # i is the batch index.
        # print(next(iter(test_iter)).text)  # debug: peek at the first batch
        for i, batch in enumerate(test_iter):
            data, label = batch.text, batch.label - 1
            print('test_batch_id:{}'.format(i))
            outputs = model(data)
            # torch.max(...) returns (values, indices); [0] is the maximum
            # value and [1] is the index of the maximum along dim 1.
            _, predicted = torch.max(outputs.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
            print('Accuracy of the network on test set:{}'.format(100 * correct / total))
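# Illustration of torch.max along dim 1, with toy numbers:
#
#   scores = torch.tensor([[0.1, 2.0, 0.3],
#                          [1.5, 0.2, 0.9]])
#   vals, preds = torch.max(scores, 1)   # vals: [2.0, 1.5], preds: [1, 0]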



if __name__ == '__main__':
    train_csv = './data/train.csv'
    test_csv = './data/test.csv'
    word2vec_dir = './data/glove.6B.300d.txt'
    model_dir = './model/Fasttext_model.pkl'
    sentence_max_size = 50   # maximum number of words per article
    batch_size = 64
    epoch = 2
    embedding_dim = 300
    learning_rate = 0.001
    hidden_size = 200
    label_size = 4
    train_iter,test_iter,vocab = get_data_iter(train_csv,test_csv,sentence_max_size)
    # Define the model.
    model = Fasttext(vocab = vocab,vec_dim = embedding_dim,label_size = label_size,hidden_size=hidden_size)
    print('------Begin training------')
    train_model(model, train_iter, epoch, learning_rate, batch_size)
    torch.save(model,model_dir)
    print('------Begin test model------')
    model_test(model,test_iter)
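    # To reload the trained model later (torch.save above pickles the whole
    # module object, so the Fasttext class definition must be importable):
    #   model = torch.load(model_dir)
    #   model_test(model, test_iter)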