# 代碼 (code listing)
import spacy
from itertools import combinations
# Shared spaCy pipeline, loaded once at import time and used by the
# pre_process() and similarity_filter() functions in this file.
# The "zh_core_web_lg" model package must be installed for this call to succeed.
nlp = spacy.load("zh_core_web_lg")
# Alternative pipeline for English input (see the lemma/text note on pre_process):
# nlp = spacy.load("en_core_web_md")
# 1. Pre-process the list of titles: drop stop words from every title
#    (e.g. Chinese '的', English 'of') and rebuild each title as a string.
# 2. With an English pipeline the tokens may be collected as ``token.lemma_``
#    (lemmatized); with a Chinese pipeline use ``token.text`` as done here.
def pre_process(titles):
    """
    Pre-process titles by removing stop words.

    Each title is parsed with the module-level ``nlp`` pipeline; tokens flagged
    as stop words are discarded and the remaining token texts are joined with
    a single space.

    :param titles: list of strings, contains target titles.
    :return: preprocessed_title_docs, list of strings containing the
        pre-processed titles, aligned index-for-index with ``titles``.
        A title consisting only of stop words becomes an empty string.
    """
    preprocessed_title_docs = []
    for title in titles:
        # Keep the surface text of every non-stop-word token.
        kept_tokens = [token.text for token in nlp(title) if not token.is_stop]
        preprocessed_title_docs.append(" ".join(kept_tokens))
    # Return the processed strings (the previous version returned the
    # unmodified input, which silently disabled the pre-processing step).
    return preprocessed_title_docs
# 1. Pair every title with every later title: n * (n - 1) / 2 comparisons.
# 2. Score each pair with spaCy's Doc.similarity and treat the pair as
#    near-duplicates when the score exceeds the threshold (0.8 by default).
# 3. Drop the earlier title of every near-duplicate pair.
def similarity_filter(titles, threshold=0.8):
    """
    Remove titles that are too similar to a later title in the list.

    The titles are pre-processed with ``pre_process`` and each result is
    parsed once with the module-level ``nlp`` pipeline. A title is removed
    when at least one title that comes after it scores above ``threshold``;
    the last member of every group of similar titles is therefore kept.

    One pass is sufficient: every title that survives has no later title
    above the threshold, so no similar pair can remain among the survivors.

    :param titles: list of strings, contains titles. The list is filtered
        IN PLACE and the same list object is returned.
    :param threshold: similarity score above which two titles are treated as
        duplicates of each other. Defaults to 0.8.
    :return: ``titles`` with the similar titles removed.
    """
    # Preprocess titles and parse each one a single time (instead of
    # re-parsing both members of every pair).
    preprocessed_title_docs = pre_process(titles)
    docs = [nlp(text) for text in preprocessed_title_docs]

    # Track positions rather than values so that exact duplicates are
    # handled correctly (a value lookup always finds the first occurrence).
    indices_to_remove = set()
    for first, second in combinations(range(len(docs)), 2):
        if first in indices_to_remove:
            continue  # already scheduled for removal, skip the comparison
        if docs[first].similarity(docs[second]) > threshold:
            indices_to_remove.add(first)

    if indices_to_remove:
        # Slice assignment keeps the caller's list object, matching the
        # original in-place behaviour.
        titles[:] = [
            title
            for position, title in enumerate(titles)
            if position not in indices_to_remove
        ]
    return titles
if __name__ == '__main__':
    # Demo input: eleven copies of the same title followed by five distinct
    # titles, two of which are reorderings of each other.
    titles_ = ["我愛中國人民"] * 11 + [
        "2020年家庭",
        "1月銷售預測",
        "賬戶餘額2017-2018",
        "銷售預測1月份",
        "年終晚會回顧-2020",
    ]
    res = similarity_filter(titles_)
    print(res)
    # Console output reported by the author:
    # ['我愛中國人民', '2020年家庭', '賬戶餘額2017-2018', '銷售預測1月份', '年終晚會回顧-2020']
# spacy安裝教程 (spaCy installation tutorial)