- Compute the similarity between words with the cosine similarity formula
- Solve analogy reasoning problems with word embeddings
- Visualize the embedding matrix with the t-SNE visualization algorithm
Load the pre-trained GloVe word vectors
import numpy as np

def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            # each line is: word followed by its vector components
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
    return words, word_to_vec_map
- words: the set of words contained in the corpus
- word_to_vec_map (dict): the word vector for each word
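A quick usage sketch, assuming a 50-dimensional GloVe file is available locally (the path data/glove.6B.50d.txt below is an assumption; adjust it to wherever the file is stored):

words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')  # assumed path
print(len(words))                       # vocabulary size
print(word_to_vec_map['the'].shape)     # (50,) for 50-dimensional GloVe vectors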
Write a cosine_similarity function to compute the similarity between two words.
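For reference, the cosine similarity that the comments below refer to as formula (1) is:

$$\text{CosineSimilarity}(u, v) = \frac{u \cdot v}{\|u\|_2 \, \|v\|_2}$$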
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v

    Arguments:
        u -- a word vector of shape (n,)
        v -- a word vector of shape (n,)

    Returns:
        cosine_similarity -- the cosine similarity between u and v defined by the formula above.
    """
    ### START CODE HERE ###
    # Compute the dot product between u and v (≈1 line)
    dot = np.dot(u, v)
    # Compute the L2 norm of u (≈1 line)
    norm_u = np.sqrt(np.sum(u * u))
    # Compute the L2 norm of v (≈1 line)
    norm_v = np.sqrt(np.sum(v * v))
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u * norm_v)
    ### END CODE HERE ###

    return cosine_similarity
- cosine_similarity: the similarity lies in the range [-1, 1]
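A small sanity check, assuming word_to_vec_map was loaded as above (the word pairs are just illustrative examples):

father = word_to_vec_map['father']
mother = word_to_vec_map['mother']
ball = word_to_vec_map['ball']
crocodile = word_to_vec_map['crocodile']
print(cosine_similarity(father, mother))    # related words -> similarity close to 1
print(cosine_similarity(ball, crocodile))   # unrelated words -> noticeably lower similarity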
Write a complete_analogy function to perform word-to-word analogy reasoning.
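In vector terms, the task "a is to b as c is to d" looks for the word d whose embedding best satisfies

$$e_b - e_a \approx e_d - e_c$$

so the function below returns $\arg\max_{w} \ \text{CosineSimilarity}(e_b - e_a,\; e_w - e_c)$ over the vocabulary.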
def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """
    Performs the word analogy task as explained above: a is to b as c is to ____.

    Arguments:
        word_a -- a word, string
        word_b -- a word, string
        word_c -- a word, string
        word_to_vec_map -- dictionary that maps words to their corresponding vectors.

    Returns:
        best_word -- the word such that v_b - v_a is close to v_best_word - v_c, as measured by cosine similarity
    """
    # convert words to lower case
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()

    ### START CODE HERE ###
    # Get the word embeddings e_a, e_b and e_c (≈1-3 lines)
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]
    ### END CODE HERE ###

    words = word_to_vec_map.keys()
    max_cosine_sim = -100   # Initialize max_cosine_sim to a large negative number
    best_word = None        # Initialize best_word with None, it will help keep track of the word to output

    # loop over the whole word vector set
    for w in words:
        # to avoid best_word being one of the input words, skip them
        if w in [word_a, word_b, word_c]:
            continue

        ### START CODE HERE ###
        # Compute cosine similarity between the vector (e_b - e_a) and the vector ((w's vector representation) - e_c) (≈1 line)
        cosine_sim = cosine_similarity(e_b - e_a, word_to_vec_map[w] - e_c)

        # If the cosine_sim is more than the max_cosine_sim seen so far,
        # then: set the new max_cosine_sim to the current cosine_sim and the best_word to the current word (≈3 lines)
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w
        ### END CODE HERE ###

    return best_word
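A usage sketch, assuming word_to_vec_map was loaded as above; the triples are illustrative, and with 50-dimensional GloVe vectors some analogies may not resolve exactly as expected:

print(complete_analogy('italy', 'italian', 'spain', word_to_vec_map))   # expected: something like 'spanish'
print(complete_analogy('man', 'woman', 'boy', word_to_vec_map))         # expected: something like 'girl'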
Visualize the embedding vectors with the t-SNE algorithm
from sklearn.manifold import TSNE
from matplotlib import pylab
%matplotlib inline

embeddings = np.zeros((200, 50))   # initialize the embedding matrix: 200 words, 50-dimensional vectors
embeded_words = []
for index, word in enumerate(words):
    if index < 200:                # visualize only the first 200 embedding vectors
        embeddings[index] = word_to_vec_map[word]
        embeded_words.append(word)

output = TSNE(n_components=2).fit_transform(embeddings)   # reduce to 2 dimensions with t-SNE

pylab.figure(figsize=(20, 20))
for i in range(output.shape[0]):
    x, y = output[i]
    pylab.scatter(x, y)
    pylab.annotate(embeded_words[i], xy=(x, y), xytext=(5, 2), textcoords='offset points',
                   ha='right', va='bottom')
pylab.show()
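t-SNE is stochastic, so the layout changes from run to run. Fixing random_state (and optionally tuning perplexity) makes the plot reproducible; the parameter values below are assumptions for illustration, not taken from the original:

# reproducible variant of the dimensionality-reduction step above (assumed parameter values)
output = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(embeddings)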