# Data download link: https://pan.baidu.com/s/17IhfiYTfo0UeLoir1h3kjA
# Extraction code: px2z
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torchtext import data
from torchtext.vocab import Vectors
from torchtext.data import Iterator,BucketIterator,TabularDataset
import os
import random
import logging
import numpy as np
import pandas as pd
# Use the GPU when available; models and batches below are moved to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Returns the Dataset needed to construct the iterators.
# Data format: each row has three columns — label, title, content:
class MyDataset(data.Dataset):
    """torchtext Dataset built from a CSV with columns: label, title, text.

    The title column is never used.  Optional augmentation (word dropout or
    token shuffling) is applied to the training split only; the test split
    keeps its labels so test-set accuracy can be computed.
    """

    def __init__(self, path, text_field, label_field, test=False, aug=False, **kwargs):
        # "title" is present in the CSV but deliberately mapped to None.
        fields = [("label", label_field), ("title", None), ("text", text_field)]
        examples = []
        csv_data = pd.read_csv(path)
        print('read data from {}'.format(path))
        if test:
            # Test split: keep labels for evaluation, never augment.
            for label, text in tqdm(zip(csv_data['label'], csv_data['text'])):
                examples.append(data.Example.fromlist([label, None, text], fields))
        else:
            for label, text in tqdm(zip(csv_data['label'], csv_data['text'])):
                if aug:
                    # 50/50 choice between the two augmentation strategies.
                    if random.random() > 0.5:
                        text = self.dropout(text)
                    else:
                        text = self.shuffle(text)
                # Example stores each column of the row as an attribute.
                examples.append(data.Example.fromlist([label, None, text], fields))
        # Hand the pre-built examples to the standard torchtext Dataset.
        super(MyDataset, self).__init__(examples, fields)

    def shuffle(self, text):
        """Return *text* with its whitespace-separated tokens in random order."""
        tokens = np.random.permutation(text.strip().split())
        return ' '.join(tokens)

    def dropout(self, text, p=0.5):
        """Randomly delete a fraction *p* of the tokens in *text*.

        replace=False guarantees exactly int(len * p) distinct positions are
        dropped; the previous sampling with replacement could pick the same
        index twice and delete fewer tokens than requested.
        """
        tokens = text.strip().split()
        drop = set(np.random.choice(len(tokens), int(len(tokens) * p), replace=False))
        # Skip dropped positions instead of joining empty strings, which
        # previously left runs of double spaces in the augmented text.
        return ' '.join(tok for i, tok in enumerate(tokens) if i not in drop)
class Fasttext(nn.Module):
    """FastText-style classifier: mean-pooled embeddings feeding a small MLP.

    Args:
        vocab: training-set vocabulary; len(vocab) sizes the embedding table
            and vocab.vectors supplies the pretrained weights.
        vec_dim: word-vector dimensionality.
        label_size: number of output classes.
        hidden_size: width of the hidden layer.
    """

    def __init__(self, vocab, vec_dim, label_size, hidden_size):
        super(Fasttext, self).__init__()
        # Embedding table initialised from the pretrained vectors and kept
        # trainable (fine-tuned during training).
        self.embedding = nn.Embedding(len(vocab), vec_dim)
        self.embedding.weight.data.copy_(vocab.vectors)
        self.embedding.weight.requires_grad = True
        # Classifier head; layer order matters for saved state-dict keys.
        head = [
            nn.Linear(vec_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size, label_size),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, text):
        """text: (batch, seq) token ids -> (batch, label_size) logits."""
        embedded = self.embedding(text)
        # Average over the sequence dimension — the FastText "bag" step.
        pooled = embedded.mean(dim=1)
        return self.fc(pooled)
def get_data_iter(train_csv, test_csv, fix_length, vectors_path=None, bs=None):
    """Build train/test BucketIterators and the text vocabulary.

    Args:
        train_csv, test_csv: paths to the CSV splits.
        fix_length: pad/truncate every example to this many tokens.
        vectors_path: path to pretrained word vectors; defaults to the
            module-level ``word2vec_dir`` for backward compatibility.
        bs: batch size; defaults to the module-level ``batch_size``.

    Returns:
        (train_iter, test_iter, vocab)
    """
    # Fall back to the script-level globals so existing callers keep working,
    # while allowing other modules to pass the paths/sizes explicitly.
    if vectors_path is None:
        vectors_path = word2vec_dir
    if bs is None:
        bs = batch_size
    Text = data.Field(sequential=True, lower=True, fix_length=fix_length, batch_first=True)
    Label = data.Field(sequential=False, use_vocab=False)
    # MyDataset is used instead of TabularDataset because TabularDataset
    # kept running out of range on this data.
    train = MyDataset(path=train_csv, text_field=Text, label_field=Label, test=False, aug=True)
    # aug is ignored on the test split; pass False to avoid confusion.
    test = MyDataset(path=test_csv, text_field=Text, label_field=Label, test=True, aug=False)
    vectors = Vectors(name=vectors_path)
    # Build the vocabulary from the training split only.
    Text.build_vocab(train, vectors=vectors)
    vocab = Text.vocab
    train_iter, test_iter = BucketIterator.splits(
        (train, test),
        batch_size=bs,
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        repeat=False)
    return train_iter, test_iter, vocab
def train_model(model, train_iter, epoch, lr, batch_size, device=None):
    """Train *model* in-place with Adam and cross-entropy loss.

    Args:
        model: network to train (moved to *device*).
        train_iter: iterable of batches exposing ``.text`` and ``.label``.
        epoch: number of passes over the data.
        lr: Adam learning rate.
        batch_size: kept for interface compatibility; no longer used to
            rescale the reported loss (see note below).
        device: optional torch.device; defaults to CUDA when available,
            matching the original module-level behaviour.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    model.to(device)
    criterion.to(device)
    for i in range(epoch):
        for batch_idx, batch in enumerate(train_iter):
            # Labels in the dataset are 1-based (1..4); shift to 0-based
            # because PyTorch class indices start at 0.
            # Renamed local from `data` to `inputs` to stop shadowing the
            # imported torchtext `data` module.
            inputs, target = batch.text, batch.label - 1
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 500 == 0:
                # loss.item() is already the per-example mean (CrossEntropyLoss
                # default reduction); dividing by batch_size again, as the old
                # code did, under-reported the loss by a factor of batch_size.
                print('train_epoch:{},batch_id:{},loss:{}'.format(i, batch_idx, loss.item()))
    print('Finished Training')
def model_test(model, test_iter, device=None):
    """Evaluate *model* on *test_iter* and return the accuracy in percent.

    Args:
        model: trained network.
        test_iter: iterable of batches exposing ``.text`` and ``.label``.
        device: optional torch.device; defaults to CUDA when available,
            matching the original module-level behaviour.

    Returns:
        float accuracy percentage (0.0 if the iterator yielded no examples).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    model.to(device)
    correct = 0
    total = 0
    with torch.no_grad():
        # The old stray debug print of the first batch has been removed.
        for i, batch in enumerate(test_iter):
            # Labels are 1-based in the data; shift to match model outputs.
            inputs, label = batch.text, batch.label - 1
            print('test_batch_id:{}'.format(i))
            outputs = model(inputs)
            # torch.max returns (values, indices); the indices along dim 1
            # are the predicted class ids.
            _, predicted = torch.max(outputs.data, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    # Guard against an empty iterator instead of dividing by zero.
    accuracy = 100 * correct / total if total else 0.0
    print('Accuracy of the network on test set:{}'.format(accuracy))
    return accuracy
#
if __name__ == '__main__':
    train_csv = './data/train.csv'
    test_csv = './data/test.csv'
    word2vec_dir = './data/glove.6B.300d.txt'   # pretrained word vectors
    model_dir = './model/Fasttext_model.pkl'
    sentence_max_size = 50                      # max tokens per document
    batch_size = 64
    epoch = 2
    embedding_dim = 300                         # must match the vector file
    learning_rate = 0.001
    hidden_size = 200
    label_size = 4                              # number of target classes
    train_iter, test_iter, vocab = get_data_iter(train_csv, test_csv, sentence_max_size)
    # Build the model
    model = Fasttext(vocab=vocab, vec_dim=embedding_dim, label_size=label_size, hidden_size=hidden_size)
    print('------Begin training------')
    # Fix: the original called the undefined name `Strain_model`, which
    # raised NameError before training ever started.
    train_model(model, train_iter, epoch, learning_rate, batch_size)
    torch.save(model, model_dir)
    print('------Begin test model------')
    model_test(model, test_iter)