1、簡介
最近一直在研究NLP的文本相似度算法,本文將利用TF-IDF特徵向量和Simhash指紋計算中文文本的相似度。
2、計算過程
- 準備測試數據
- 預處理讀到的數據
- 加載數據到Map中
- 輸入用戶問題
- 利用TF-IDF特徵向量和Simhash指紋,計算用戶問題與預處理後配置文件中各條數據的相似度分值
3、效果圖
4、核心代碼
# Core scoring step: turn the user question into a feature vector, score it
# against every preprocessed entry stored under nodeId, and print the best
# Complete_MayBeL4Max matches.
# `question`, `nodeId`, `nodeMap`, `re_test`, `jt`, `fb`, `smb`, `DocFeatLoader`
# and `cosine_distance_nonzero` are defined outside this excerpt.
# NOTE(review): the listing has lost its indentation; restore the nesting
# from the original source before running it.
try:
text = re_test.run(question) # look up the matching data with regular expressions
doc_token = jt.tokens(text) # preprocessing: word segmentation
doc_feat = fb.compute(doc_token)
doc_fl = DocFeatLoader(smb, doc_feat) # the object holds two members # fingerprint: fingerprint score # feat_vec: list of tuples
# The preprocessed configuration data
contentFlListMap = nodeMap
# One record per candidate entry: score plus the fields copied from the entry.
p_score_list = []
# Only score when the map has entries for this node; otherwise the list stays empty.
if nodeId in contentFlListMap.keys():
nodeFlList = contentFlListMap[nodeId]
print("nodeFilist",nodeFlList)
for i in range(len(nodeFlList)):
p_score_dict={}
# Compare the entry's stored feature vector with the question's feature vector.
# The key is spelled "lableDataFeatureVector" here; it presumably has to match
# the key written by the preprocessing step, so it is left as-is (TODO confirm).
dist = cosine_distance_nonzero(nodeFlList[i]["lableDataFeatureVector"].feat_vec, doc_fl.feat_vec, norm=False)
p_score_dict["score"] = dist
p_score_dict["labelData"] = nodeFlList[i]["labelData"]
p_score_dict["targetNodeId"] = nodeFlList[i]["targetNodeId"]
p_score_dict["conditionId"] = nodeFlList[i]["conditionId"]
p_score_list.append(p_score_dict)
# Sort by score, largest first.
# NOTE(review): the helper is named "distance" but larger values are ranked as
# better matches here - presumably it returns a similarity-like value; verify.
p_score_list = sorted(p_score_list, key=lambda score : score["score"], reverse=True)
print("Sorted:",p_score_list)
# Three parallel lists holding the label, score and condition id of the top results.
Complete_MayBeL4 = []
Complete_MayBeL4Score = []
Complete_MayBeL4ID = []
# Maximum number of results to keep.
Complete_MayBeL4Max = 3
# Copy entries in sorted order and stop as soon as the cap is reached.
for i, el in enumerate(p_score_list):
p_label = p_score_list[i]["labelData"]
p_score = p_score_list[i]["score"]
p_conditionId = p_score_list[i]["conditionId"]
if len(Complete_MayBeL4) < Complete_MayBeL4Max:
Complete_MayBeL4.append(p_label)
Complete_MayBeL4Score.append(p_score)
Complete_MayBeL4ID.append(p_conditionId)
else:
break
# Report the question, the top labels, their scores and their condition ids.
print("************************************")
print("用戶問題:", question)
print("相似問(Max=%s):%s"%(Complete_MayBeL4Max,Complete_MayBeL4))
print("特徵值(Max=%s):%s"%(Complete_MayBeL4Max,Complete_MayBeL4Score))
print("ID:",Complete_MayBeL4ID)
# The results are only printed; the return value is six empty strings.
return "", "", "", "", "", ""
# Any exception raised above is caught and printed; no value is returned
# from this branch in the excerpt shown.
except Exception as e:
print("************************************")
print("Error textSimilarity:", str(e))
print("************************************")
5、此項目Github源碼分享
https://github.com/ShaShiDiZhuanLan/Demo_TFIDF_Simhash_Python