word2vec
The PTB Dataset
import collections
import math
import random
import sys
import time
import os
import numpy as np
import torch
from torch import nn
import torch.utils.data as Data
with open('/home/kesci/input/ptb_train1020/ptb.train.txt', 'r') as f:
    lines = f.readlines()  # sentences in this dataset are separated by newlines
    raw_dataset = [st.split() for st in lines]  # st is short for "sentence"; words are separated by spaces
print('# sentences: %d' % len(raw_dataset))
# For the first 3 sentences of the dataset, print the number of tokens and the first 5 tokens.
# The end-of-sentence token is '<eos>', rare words are all replaced by '<unk>', and numbers by 'N'
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])
Building the Vocabulary Index
counter = collections.Counter([tk for st in raw_dataset for tk in st])  # tk is short for "token"
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))  # keep only words that appear at least 5 times in the dataset
idx_to_token = [tk for tk, _ in counter.items()]
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
           for st in raw_dataset]  # words in raw_dataset are converted to their indices in this step
num_tokens = sum([len(st) for st in dataset])
print('# tokens: %d' % num_tokens)
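As a quick sanity check, the two mappings are inverses of each other (a minimal sketch; 'the' is assumed to survive the frequency cutoff, which it easily does in PTB):

assert idx_to_token[token_to_idx['the']] == 'the'
assert all(token_to_idx[idx_to_token[i]] == i for i in range(len(idx_to_token)))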
Subsampling
def discard(idx):
    '''
    @params:
        idx: index of the word
    @return: True/False, whether to discard this word
    '''
    return random.uniform(0, 1) < 1 - math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)
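The comparison above implements the subsampling rule from the word2vec paper: word $w_i$ is discarded with probability

$$P(\text{discard}\; w_i) = \max\left(0,\ 1 - \sqrt{\frac{t}{f(w_i)}}\right), \qquad t = 10^{-4},\quad f(w_i) = \frac{\text{count}(w_i)}{\text{num\_tokens}}$$

so high-frequency words such as 'the' are dropped often, while words with frequency below the threshold $t$ are always kept.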
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
print('# tokens: %d' % sum([len(st) for st in subsampled_dataset]))
def compare_counts(token):
    return '# %s: before=%d, after=%d' % (token, sum(
        [st.count(token_to_idx[token]) for st in dataset]), sum(
        [st.count(token_to_idx[token]) for st in subsampled_dataset]))

print(compare_counts('the'))   # a frequent word: its count drops sharply after subsampling
print(compare_counts('join'))  # a rare word: its count is (almost) unchanged
Extracting Center and Context Words
def get_centers_and_contexts(dataset, max_window_size):
    '''
    @params:
        dataset: a collection of sentences, each a collection of words already
                 converted to their integer indices
        max_window_size: maximum size of the context window
    @return:
        centers: collection of center words
        contexts: collection of context windows, one per center word; each window
                  is a collection of context words
    '''
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:  # a sentence needs at least 2 words to form a "center word, context word" pair
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)  # randomly pick the context window size
            indices = list(range(max(0, center_i - window_size),
                                 min(len(st), center_i + 1 + window_size)))
            indices.remove(center_i)  # exclude the center word from its context
            contexts.append([st[idx] for idx in indices])
    return centers, contexts
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)
The Skip-Gram Model
embed = nn.Embedding(num_embeddings=10, embedding_dim=4)
print(embed.weight)
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
print(embed(x))
X = torch.ones((2, 1, 4))
Y = torch.ones((2, 4, 6))
print(torch.bmm(X, Y).shape)
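As a sanity check on what torch.bmm computes, here is a minimal sketch (X2 and Y2 are hypothetical random tensors introduced just for this check) comparing it against an explicit per-example matrix product:

X2 = torch.randn(2, 1, 4)
Y2 = torch.randn(2, 4, 6)
ref = torch.stack([X2[b] @ Y2[b] for b in range(X2.shape[0])])  # multiply each pair in the batch
print(torch.allclose(torch.bmm(X2, Y2), ref))  # True: bmm is batched matrix multiplication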
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    '''
    @params:
        center: indices of center words, an integer tensor of shape (n, 1)
        contexts_and_negatives: indices of context and noise words, an integer tensor of shape (n, m)
        embed_v: embedding layer for center words
        embed_u: embedding layer for context words
    @return:
        pred: inner products of each center word with its context (or noise) words,
              later used to compute the probability p(w_o|w_c)
    '''
    v = embed_v(center)                  # shape (n, 1, d)
    u = embed_u(contexts_and_negatives)  # shape (n, m, d)
    pred = torch.bmm(v, u.permute(0, 2, 1))  # bmm((n, 1, d), (n, d, m)) => shape (n, 1, m)
    return pred
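A minimal shape check of skip_gram, reusing the toy embed layer from above for both embed_v and embed_u (sharing one layer here is purely an illustrative assumption):

center = torch.tensor([[1], [2]], dtype=torch.long)               # (n, 1) = (2, 1)
ctx_neg = torch.tensor([[3, 4, 5], [6, 7, 8]], dtype=torch.long)  # (n, m) = (2, 3)
print(skip_gram(center, ctx_neg, embed, embed).shape)             # torch.Size([2, 1, 3])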
Negative Sampling
def get_negatives(all_contexts, sampling_weights, K):
    '''
    @params:
        all_contexts: [[w_o1, w_o2, ...], [...], ... ]
        sampling_weights: sampling weight of each word for drawing noise words
        K: number of noise words to sample per context word
    @return:
        all_negatives: [[w_n1, w_n2, ...], [...], ...]
    '''
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                # Randomly draw k word indices as noise-word candidates, according
                # to each word's weight (sampling_weights). For efficiency, k can
                # be set somewhat larger than needed.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be a context word
            if neg not in set(contexts):
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)
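Raising the counts to the power 0.75 follows the word2vec paper: the noise distribution is

$$P(w) \propto \text{count}(w)^{0.75}$$

which flattens the unigram distribution, so rare words are sampled somewhat more often than their raw frequency would suggest.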
Reading Data in Batches
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        self.centers = centers
        self.contexts = contexts
        self.negatives = negatives

    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index], self.negatives[index])

    def __len__(self):
        return len(self.centers)

def batchify(data):
    '''
    Used as the collate_fn argument of DataLoader
    @params:
        data: a list of length batch_size; each element is a result of __getitem__
    @outputs:
        batch: the batched tuple (centers, contexts_negatives, masks, labels)
            centers: indices of center words, an integer tensor of shape (n, 1)
            contexts_negatives: indices of context and noise words, an integer tensor of shape (n, m)
            masks: 0/1 mask marking the padding, an integer tensor of shape (n, m)
            labels: 0/1 labels, 1 for context words and 0 for noise words or padding, shape (n, m)
    '''
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1] * cur_len + [0] * (max_len - cur_len)]  # the mask keeps padded entries from affecting the loss
        labels += [[1] * len(context) + [0] * (max_len - len(context))]
    batch = (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives),
             torch.tensor(masks), torch.tensor(labels))
    return batch
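A toy run of batchify on a hypothetical hand-made batch of two (center, contexts, negatives) triples of unequal length shows the padding, mask, and label layout:

toy_batch = [(1, [2, 3], [4, 5]), (6, [7], [8, 9])]
centers, ctx_neg, masks, labels = batchify(toy_batch)
print(ctx_neg)  # tensor([[2, 3, 4, 5], [7, 8, 9, 0]])  <- second row padded with 0
print(masks)    # tensor([[1, 1, 1, 1], [1, 1, 1, 0]])  <- padding masked out
print(labels)   # tensor([[1, 1, 0, 0], [1, 0, 0, 0]])  <- 1 marks context words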
batch_size = 512
num_workers = 0 if sys.platform.startswith('win32') else 4
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
collate_fn=batchify,
num_workers=num_workers)
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks',
                           'labels'], batch):
        print(name, 'shape:', data.shape)
    break
Training the Model
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask=None):
        '''
        @params:
            inputs: logits; after a sigmoid layer they become the predicted probability that D=1
            targets: 0/1 vector, where 1 marks a context word and 0 a noise word
        @return:
            res: loss averaged over each example's unmasked labels
        '''
        if mask is None:
            mask = torch.ones_like(inputs)  # without a mask, every position counts
        inputs, targets, mask = inputs.float(), targets.float(), mask.float()
        res = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none", weight=mask)
        res = res.sum(dim=1) / mask.sum(dim=1)
        return res
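For one example (one row) with logits $x_j$, labels $y_j$, and mask $m_j$, this computes the masked mean of the binary cross-entropy, using $1 - \sigma(x) = \sigma(-x)$:

$$\ell = -\frac{1}{\sum_j m_j} \sum_j m_j \bigl[ y_j \log \sigma(x_j) + (1 - y_j) \log \sigma(-x_j) \bigr]$$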
loss = SigmoidBinaryCrossEntropyLoss()
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])  # in label, 1 and 0 mark context words and noise words respectively
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])   # the mask variable
print(loss(pred, label, mask))

def sigmd(x):
    # the per-element loss term -log(sigmoid(x))
    return - math.log(1 / (1 + math.exp(-x)))

print('%.4f' % ((sigmd(1.5) + sigmd(-0.3) + sigmd(1) + sigmd(-2)) / 4))  # note that 1 - sigmoid(x) = sigmoid(-x)
print('%.4f' % ((sigmd(1.1) + sigmd(-0.6) + sigmd(-2.2)) / 3))
embed_size = 100
net = nn.Sequential(nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),  # center-word embeddings (v)
                    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size))  # context-word embeddings (u)
def train(net, lr, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])
            l = loss(pred.view(label.shape), label, mask).mean()  # average loss over one batch
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, l_sum / n, time.time() - start))
train(net, 0.01, 5)
Testing the Model
def get_similar_tokens(query_token, k, embed):
    '''
    @params:
        query_token: the query word
        k: number of similar words to retrieve
        embed: the pretrained word embedding layer
    '''
    W = embed.weight.data
    x = W[token_to_idx[query_token]]
    # 1e-9 is added for numerical stability
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1) * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:  # skip the query word itself
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))
get_similar_tokens('chip', 3, net[0])
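Equivalently (a hedged alternative sketch, not the original code above), the cosine similarities can be computed with torch.nn.functional.cosine_similarity:

import torch.nn.functional as F
W = net[0].weight.data
x = W[token_to_idx['chip']]
cos = F.cosine_similarity(W, x.expand_as(W), dim=1)  # same scores up to the 1e-9 stabilizer
print(torch.topk(cos, k=4))  # top entry is 'chip' itself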