


import collections
import os
import random
import time
from tqdm import tqdm
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def read_imdb(folder='train', data_root="/home/kesci/input/IMDB2578/aclImdb_v1/aclImdb"):
    """Load every review of one IMDB split together with its sentiment label.

    Args:
        folder: Sub-directory of ``data_root`` to read ('train' by default).
        data_root: Directory that contains the split sub-directories, each
            of which holds a ``pos`` and a ``neg`` folder.

    Returns:
        A list of ``[review, label]`` pairs. ``review`` is the UTF-8 decoded,
        lower-cased file content with newline characters removed; ``label``
        is 1 for files under ``pos`` and 0 for files under ``neg``.
    """
    samples = []
    for sentiment, target in (('pos', 1), ('neg', 0)):
        sentiment_dir = os.path.join(data_root, folder, sentiment)
        for name in tqdm(os.listdir(sentiment_dir)):
            path = os.path.join(sentiment_dir, name)
            # Read raw bytes and decode explicitly so the result does not
            # depend on the platform's default text encoding.
            with open(path, 'rb') as handle:
                raw = handle.read()
            text = raw.decode('utf-8').replace('\n', '').lower()
            samples.append([text, target])
    return samples

# Root of the extracted IMDB archive; the reviews live in its "aclImdb" folder.
DATA_ROOT = "/home/kesci/input/IMDB2578/aclImdb_v1/"
data_root = os.path.join(DATA_ROOT, "aclImdb")
train_data = read_imdb('train', data_root)
test_data = read_imdb('test', data_root)

# Print the label and the first 50 characters of the first five training samples.
for sample in train_data[:5]:
    print(sample[1], '\t', sample[0][:50])


def get_tokenized_imdb(data):
    """Split every review into lower-cased tokens.

    Args:
        data: List whose elements are ``[text, label]`` pairs, where
            ``text`` is the review string and ``label`` is 0 or 1.

    Returns:
        A list with one token list per review. Tokens are obtained by
        splitting on a single space character, so consecutive spaces
        produce empty-string tokens.
    """
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]

    return [tokenizer(review) for review, _ in data]

def get_vocab_imdb(data):
    """Build the vocabulary of a data set.

    Args:
        data: List whose elements are ``[text, label]`` pairs (the raw data
            as returned by ``read_imdb``).

    Returns:
        A ``Vocab`` instance (exposing ``freqs``, ``stoi`` and ``itos``)
        built from the token counts of ``data`` with ``min_freq=5``.
    """
    tokenized_data = get_tokenized_imdb(data)
    # Count every token occurrence across all reviews.
    counter = collections.Counter(tk for st in tokenized_data for tk in st)
    return Vocab.Vocab(counter, min_freq=5)

# Build the vocabulary from the training split only and report its size.
vocab = get_vocab_imdb(train_data)
print('# words in vocab:', len(vocab))

def preprocess_imdb(data, vocab, max_l=500):
    """Convert raw reviews into fixed-length index tensors and label tensors.

    Args:
        data: List whose elements are ``[text, label]`` pairs (the raw data
            as read from disk).
        vocab: Vocabulary built on the training set; ``vocab.stoi`` maps a
            token to its integer index.
        max_l: Length every review is truncated or padded to (default 500).

    Returns:
        A ``(features, labels)`` tuple. ``features`` is an integer tensor of
        shape ``(n, max_l)`` holding token indices; ``labels`` is an integer
        tensor of shape ``(n,)`` holding the 0/1 sentiment labels.
    """
    def pad(x):
        # Truncate to max_l, then right-pad with index 0. For sequences that
        # are already long enough the multiplier is <= 0 and adds nothing.
        # NOTE(review): index 0 doubles as the padding index; confirm this
        # matches the vocabulary's padding/unknown convention.
        return x[:max_l] + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels


# Convert both splits to tensors and wrap them as datasets. TensorDataset pairs
# the tensors row by row: len(train_set) equals features.shape[0] (and
# labels.shape[0]), and train_set[i] is (features[i], labels[i]). Unpacking the
# returned tuple with * is equivalent to binding features and labels to names
# first and passing them as two arguments.
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))

batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

# Inspect the shape of a single batch only; without the break this loop would
# iterate over (and print) every batch of the training set.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))



class BiRNN(nn.Module):
    """Bidirectional LSTM encoder followed by a linear two-class decoder."""

    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        """Create the embedding, encoder and decoder layers.

        Args:
            vocab: Vocabulary built on the data set; only ``len(vocab)`` is
                used, to size the embedding table.
            embed_size: Dimension of the word embeddings.
            num_hiddens: Dimension of the LSTM hidden state (per direction).
            num_layers: Number of stacked LSTM layers.
        """
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # Encoder-decoder framework: bidirectional=True makes the encoder a
        # bidirectional recurrent network.
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=True)
        # The outputs of the first and the last time step (2 * num_hiddens
        # each) are concatenated as input of the fully connected layer.
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        """Predict sentiment logits for a batch of token-index sequences.

        Args:
            inputs: Integer tensor of token indices with shape
                ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, 2)`` with the class scores.
        """
        # The LSTM expects the sequence length as the first dimension, so the
        # input is transposed before the embedding lookup.
        embeddings = self.embedding(inputs.permute(1, 0))  # (seq_len, batch_size, d)
        # nn.LSTM returns outputs, (h, c); only the per-step outputs are used.
        outputs, _ = self.encoder(embeddings)  # (seq_len, batch_size, 2*h)
        encoding = torch.cat((outputs[0], outputs[-1]), -1)  # (batch_size, 4*h)
        outs = self.decoder(encoding)  # (batch_size, 2)
        return outs

# Hyper-parameters of the bidirectional LSTM classifier.
embed_size = 100
num_hiddens = 100
num_layers = 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)


# 100-dimensional GloVe 6B word vectors, using the given cache directory.
cache_dir = "/home/kesci/input/GloVe6B5429"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

def load_pretrained_embedding(words, pretrained_vocab):
    """Collect the pretrained vectors for a list of words.

    Args:
        words: Words to look up, given as an index-to-string list (``itos``);
            row ``i`` of the result belongs to ``words[i]``.
        pretrained_vocab: Pretrained word vectors exposing ``stoi`` (word to
            row index) and ``vectors`` (one row per word).

    Returns:
        Float tensor of shape ``(len(words), dim)``. Rows of words missing
        from ``pretrained_vocab`` are left as zeros.
    """
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # initialised to 0
    oov_count = 0  # number of out-of-vocabulary words
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
        except KeyError:
            oov_count += 1
        else:
            embed[i, :] = pretrained_vocab.vectors[idx]
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

# Initialise the embedding layer with the pretrained GloVe vectors.
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False # pretrained vectors are loaded as-is, so they are not updated


def evaluate_accuracy(data_iter, net, device=None):
    """Compute the classification accuracy of ``net`` on ``data_iter``.

    Args:
        data_iter: Iterable yielding ``(X, y)`` batches.
        net: Either an ``nn.Module`` or a plain callable. A plain callable
            that accepts an ``is_training`` argument is called with
            ``is_training=False``.
        device: Device the batches are moved to when ``net`` is an
            ``nn.Module``. Defaults to the device of the module's first
            parameter, or the CPU when the module has no parameters.

    Returns:
        Fraction of correctly classified samples as a float.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    is_module = isinstance(net, torch.nn.Module)
    if device is None and is_module:
        first_param = next(net.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    # Evaluate in eval mode (this disables dropout) and restore the previous
    # mode afterwards so a surrounding training loop is unaffected.
    was_training = net.training if is_module else False
    if is_module:
        net.eval()

    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                if is_module:
                    acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                else:
                    # Plain function model: pass is_training=False only when
                    # the function declares such an argument.
                    code = getattr(net, '__code__', None)
                    if code is not None and 'is_training' in code.co_varnames:
                        acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                    else:
                        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    """Train ``net`` and print loss/accuracy statistics after every epoch.

    Args:
        train_iter: Iterable yielding ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches used to measure the test
            accuracy after each epoch.
        net: Model to train; it is moved to ``device`` first.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.
        optimizer: Optimizer that updates the trainable parameters of ``net``.
        device: Device on which the batches and the model are placed.
        num_epochs: Number of passes over ``train_iter``.
    """
    net = net.to(device)
    print("training on ", device)
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        # Counted per epoch so the reported loss is the mean batch loss of
        # this epoch (the loss sum is reset every epoch as well).
        batch_count = 0
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            # Back-propagate and apply the parameter update.
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))
# Optimise only the parameters that still require gradients (the frozen
# embedding weights are skipped).
lr = 0.01
num_epochs = 5
trainable_params = [p for p in net.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=lr)
loss = nn.CrossEntropyLoss()

train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)


def predict_sentiment(net, vocab, sentence):
    """Classify the sentiment of a single tokenised sentence.

    Args:
        net: Trained model mapping a ``(1, seq_len)`` index tensor to
            ``(1, 2)`` class scores.
        vocab: Vocabulary built on the data set; ``vocab.stoi`` converts
            each word to the index fed to the model.
        sentence: The text to analyse, given as a sequence of words.

    Returns:
        ``'positive'`` when class 1 has the highest score, otherwise
        ``'negative'``.
    """
    device = list(net.parameters())[0].device  # device the model lives on
    indices = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    # Predict in eval mode (so dropout is inactive) without tracking
    # gradients, then restore whatever mode the model was in.
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            label = torch.argmax(net(indices.view((1, -1))), dim=1)
    finally:
        net.train(was_training)
    return 'positive' if label.item() == 1 else 'negative'

# Classify a short example sentence with the trained model.
# NOTE(review): the returned string is not printed or stored here; presumably
# this line was a notebook cell whose value was displayed automatically.
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])


def corr1d(X, K):
    """Compute the one-dimensional cross-correlation of ``X`` with ``K``.

    Args:
        X: Input tensor of shape ``(seq_len,)``.
        K: Kernel tensor of shape ``(w,)``.

    Returns:
        Tensor of shape ``(seq_len - w + 1,)`` created with ``torch.zeros``
        (default floating dtype), where element ``i`` is the sum of
        ``X[i:i + w] * K``.
    """
    w = K.shape[0]  # width of the convolution window
    Y = torch.zeros((X.shape[0] - w + 1))
    for i in range(Y.shape[0]):  # slide the window over the input
        Y[i] = (X[i: i + w] * K).sum()
    return Y

# Single-channel example: a length-7 input and a width-2 kernel.
X = torch.tensor([0, 1, 2, 3, 4, 5, 6])
K = torch.tensor([1, 2])
print(corr1d(X, K))

def corr1d_multi_in(X, K):
    """Multi-input-channel one-dimensional cross-correlation.

    Walks over the leading (channel) dimension of ``X`` and ``K`` in
    lockstep, applies ``corr1d`` to each channel pair, then stacks the
    per-channel results and sums them along dimension 0.
    """
    channel_outputs = []
    for x, k in zip(X, K):
        channel_outputs.append(corr1d(x, k))
    stacked = torch.stack(channel_outputs)
    return stacked.sum(dim=0)

# Multi-channel example: 3 input channels of length 7 and one width-2 kernel
# per channel.
X = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
K = torch.tensor([[1, 2], [3, 4], [-1, -3]])
print(corr1d_multi_in(X, K))


class GlobalMaxPool1d(nn.Module):
    """Max-over-time pooling: keep the maximum of every channel."""

    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()

    def forward(self, x):
        """Pool over the whole last dimension.

        Args:
            x: Input tensor of shape ``(batch_size, n_channels, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, n_channels, 1)`` holding the
            maximum over the sequence dimension.
        """
        # The kernel spans the full sequence length, so one value per
        # channel remains.
        return F.max_pool1d(x, kernel_size=x.shape[2])



class TextCNN(nn.Module):
    """Text classifier built from parallel 1-D convolutions and max-over-time pooling."""

    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        """Create the embedding, convolution, pooling and output layers.

        Args:
            vocab: Vocabulary built on the data set; only ``len(vocab)`` is
                used, to size the embedding tables.
            embed_size: Dimension of the word embeddings.
            kernel_sizes: List of convolution kernel widths.
            num_channels: List of output channel counts, paired element-wise
                with ``kernel_sizes``.
        """
        super(TextCNN, self).__init__()
        # Embedding layer intended to take part in training.
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # Embedding layer intended to stay fixed. This class does not freeze
        # it; the caller has to set requires_grad = False on its weight.
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        # The pooling layer has no weights, so one instance is shared.
        self.pool = GlobalMaxPool1d()
        # One 1-D convolution per (channel count, kernel width) pair. Its
        # input has 2 * embed_size channels because both embeddings are
        # concatenated.
        self.convs = nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2 * embed_size,
                                        out_channels=c,
                                        kernel_size=k))
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.dropout = nn.Dropout(0.5)  # dropout to reduce overfitting

    def forward(self, inputs):
        """Predict sentiment logits for a batch of token-index sequences.

        Args:
            inputs: Integer tensor of token indices with shape
                ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, 2)`` with the class scores.
        """
        # Concatenate both embeddings along the feature dimension.
        embeddings = torch.cat((
            self.embedding(inputs),
            self.constant_embedding(inputs)), dim=2)  # (batch_size, seq_len, 2*embed_size)
        # Conv1d expects channels before the sequence dimension.
        embeddings = embeddings.permute(0, 2, 1)  # (batch_size, 2*embed_size, seq_len)
        # Each convolution yields (batch_size, out_channels, seq_len-k+1);
        # pooling reduces that to (batch_size, out_channels, 1) and squeeze
        # drops the last dimension. The results are joined along the channel
        # dimension into (batch_size, sum(num_channels)).
        encoding = torch.cat([
            self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected output layer.
        outputs = self.decoder(self.dropout(encoding))
        return outputs

# Hyper-parameters of the TextCNN classifier.
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

# Start both embedding layers from the pretrained GloVe vectors and freeze the
# constant one, so that it really is excluded from training and the
# requires_grad filter below has an effect.
pretrained_embedding = load_pretrained_embedding(vocab.itos, glove_vocab)
net.embedding.weight.data.copy_(pretrained_embedding)
net.constant_embedding.weight.data.copy_(pretrained_embedding)
net.constant_embedding.weight.requires_grad = False

lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
