python同義詞替換(jieba分詞)
TihuanWords.txt文檔格式
注意:同一行的詞用單個空格隔開,每行第一個詞爲替換詞,同行其餘的詞出現在句子中時都會被替換成它。例如下面第一行表示「年假」和「年休」都會被替換成「年休假」。
年休假 年假 年休
究竟 到底
回家場景 我回來了
代碼
import jieba
def replaceSynonymWords(string1):
    """Replace synonyms in ``string1`` with their designated replacement words.

    The synonym table is read from ``TihuanWords.txt`` (UTF-8, resolved
    against the current working directory) on every call. Each line holds
    whitespace-separated words; the first word on a line is the replacement
    for every other word on that line. ``string1`` is segmented with jieba
    (``cut_all=False``) and every token found in the table is swapped for
    its replacement.

    Each parsed table line, the finished table, and the "/"-joined
    segmentation are printed to stdout as a debug trace.

    Args:
        string1: The sentence to process.

    Returns:
        The sentence rebuilt from the (possibly replaced) tokens, prefixed
        with a single space. The leading space is kept so existing callers
        see the same return value as before.

    Raises:
        OSError: If ``TihuanWords.txt`` cannot be opened.
    """
    # 1. Read the synonym table and build a {synonym: replacement} dict.
    combine_dict = {}
    # ``with`` guarantees the file is closed, even if reading fails midway.
    with open("TihuanWords.txt", "r", encoding='utf-8') as table:
        for line in table:
            # split() without an argument tolerates repeated spaces or tabs
            # and never yields empty-string entries, which would otherwise
            # end up as bogus keys in the table.
            words = line.split()
            if not words:
                continue  # blank line: nothing to register
            replacement, *synonyms = words
            for synonym in synonyms:
                combine_dict[synonym] = replacement
            print(words)
    print(combine_dict)

    # 2. Raise the frequency of this word so that jieba can recognise it.
    jieba.suggest_freq("年休假", tune=True)

    # 3. Segment the sentence. The tokens are materialised once because
    # they are used both for the debug trace and for the replacement pass.
    tokens = list(jieba.cut(string1, cut_all=False))
    print("/".join(tokens))

    # 4. Replace token by token. Working on the token list directly (rather
    # than splitting the "/"-joined string again) keeps any literal "/"
    # that appears in the input sentence.
    return " " + "".join(combine_dict.get(token, token) for token in tokens)
# Demo: run the synonym replacement on a sample sentence and print the result.
# This executes at module level, so it needs TihuanWords.txt in the current
# working directory.
string1 = '年休到底放幾天?'
print(replaceSynonymWords(string1))