詞嵌入進階
載入與訓練Glove向量
import torch
import torchtext.vocab as vocab
# List every pretrained alias in torchtext that is a GloVe variant.
print([key for key in vocab.pretrained_aliases.keys() if "glove" in key])
# Directory holding pre-downloaded GloVe files (avoids downloading again).
cache_dir = "/home/kesci/input/GloVe6B5429"
# Load the 50-dimensional GloVe vectors trained on the 6B-token corpus.
glove = vocab.GloVe(name='6B', dim=50, cache=cache_dir)
# Vocabulary size ("contains %d words in total").
print("一共包含%d個詞。" % len(glove.stoi))
# Round-trip check: word -> index and index -> word lookups.
print(glove.stoi['beautiful'], glove.itos[3366])
求近義詞和類比詞
def knn(W, x, k):
    """Return the indices and cosine similarities of the k rows of W
    most similar to the query vector x.

    @params:
        W: (n, d) tensor holding all candidate vectors
        x: query vector (flattened to length d before use)
        k: number of nearest neighbours to return
    @outputs:
        topk: numpy array of the indices of the k most similar rows of W
        list of the corresponding cosine similarities (floats)
    """
    x = x.view(-1)
    # Add a small epsilon to BOTH norms (the original only protected the
    # rows of W) so an all-zero query vector cannot divide by zero.
    cos = torch.matmul(W, x) / (
        (torch.sum(W * W, dim=1) + 1e-9).sqrt()
        * (torch.sum(x * x) + 1e-9).sqrt())
    _, topk = torch.topk(cos, k=k)
    topk = topk.cpu().numpy()
    return topk, [cos[i].item() for i in topk]
def get_similar_tokens(query_token, k, embed):
    """Print the k tokens whose embeddings are most cosine-similar
    to query_token.

    @params:
        query_token: the word to look up
        k: how many similar words to print
        embed: pretrained word vectors (provides .vectors/.stoi/.itos)
    """
    query_vec = embed.vectors[embed.stoi[query_token]]
    # Ask for k+1 neighbours: the closest match is the query word itself.
    indices, similarities = knn(embed.vectors, query_vec, k + 1)
    for idx, sim in zip(indices[1:], similarities[1:]):  # drop the query word
        print('cosine sim=%.3f: %s' % (sim, (embed.itos[idx])))
# Demo: print the 3 GloVe words most similar to 'chip'.
get_similar_tokens('chip', 3, glove)
求類比詞
def get_analogy(token_a, token_b, token_c, embed):
    """Solve the word analogy a : b :: c : ? using pretrained vectors.

    @params:
        token_a: word a
        token_b: word b
        token_c: word c
        embed: pretrained word vectors (provides .vectors/.stoi/.itos)
    @outputs:
        the word d whose vector is closest to vec(b) - vec(a) + vec(c)
    """
    vec_a, vec_b, vec_c = (embed.vectors[embed.stoi[word]]
                           for word in (token_a, token_b, token_c))
    query = vec_b - vec_a + vec_c
    best, _ = knn(embed.vectors, query, 1)
    return embed.itos[best[0]]
# Demo: solve man : woman :: son : ?  (GloVe should answer 'daughter').
# Fixed: removed the stray "```" markdown-fence residue that made this
# line a syntax error.
get_analogy('man', 'woman', 'son', glove)