Task:
Given a text corpus, e.g. unlabeled news articles, plus a set of documents that have already been labeled, find the articles in the corpus that are similar to the labeled documents.
Tools used: Python 2, gensim, NLTK, NumPy
References:
Processing flow: read documents → tokenize and lowercase → remove stop words and punctuation → stem → build dictionary and bag-of-words corpus → train LSI model → build similarity index → query → save similarity matrix.
Project source code:
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 11 14:01:04 2017
@author: kelvin-li
"""
# Read the 9722 documents into a list (Python 2; raw strings avoid
# backslash escapes in the Windows paths).
doc = []
f = open(r"C:\Users\kelvin-li\*****.txt")
for line in f.readlines():
    f_split = line.split(':::')
    path = f_split[6]
    docpath = path.replace(r'C:\Users\***', r'C:\Users\kelvin-li\***')
    docpath = docpath.replace('\n', '')  # strip the trailing newline from the path
    doc_file = open(docpath)
    doc.append(doc_file.read())
    doc_file.close()
f.close()

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities

# Lowercase the text by naive splitting; punctuation is still attached to the
# words, so this list is superseded by the NLTK tokenization below.
texts_lower = [[word for word in document.lower().split()] for document in doc]

from nltk.tokenize import word_tokenize
# For each document: decode as UTF-8, tokenize, then lowercase every token.
texts_tokenized = [[word.lower() for word in word_tokenize(document.decode('utf-8'))]
                   for document in doc]

# Remove stop words using NLTK's built-in English stop-word list.
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
texts_filtered_stopwords = [[word for word in document if word not in english_stopwords]
                            for document in texts_tokenized]

# Stop words are gone but punctuation remains; filter it with an explicit list.
english_punctuations = [',', '.', ':', ';', '?', '!', '(', ')', '[', ']',
                        '@', '&', '#', '%', '$', '{', '}', '--', '-']
texts_filtered = [[word for word in document if word not in english_punctuations]
                  for document in texts_filtered_stopwords]

# Stem the English words to collapse inflected forms and suffixes.
# NLTK provides the Lancaster and Porter stemmers; a comparison showed that
# Lancaster strips too many trailing e's, so Porter is used here.
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()
#from nltk.stem.lancaster import LancasterStemmer
#st = LancasterStemmer()
texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]

'''
# Drop low-frequency words that occur only once in the corpus.
all_stems = sum(texts_stemmed, [])
stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
'''

# Build a dictionary mapping each word to an id, with its corpus frequency.
dictionary = corpora.Dictionary(texts_stemmed)
# Convert every document in the corpus to a bag-of-words list of (id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts_stemmed]

# Define a 20-dimensional LSI space.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=20)
# Transform the corpus into LSI space and index it.
index = similarities.MatrixSimilarity(lsi[corpus])

# Documents to compare.
'''
# Randomly pick 5 documents to compare against the rest.
import random
text_num = random.sample(xrange(10), 5)
'''
# Compare the 1000 documents listed in the index file.
text_num = []
num_file = open(r"C:\Users\kelvin-li\Desktop\MMSED-Text\Text_Index1_10.txt")
for line in num_file.readlines():
    line = line.strip('\n')
    text_num.append(line)
num_file.close()
# Convert the strings to integers, shifting the file's 1-based indices to 0-based.
text_num = [int(i) - 1 for i in text_num]

sims = []
for count in text_num:
    compare_text = dictionary.doc2bow(texts_stemmed[count])
    sims.append(index[lsi[compare_text]])  # similarity query against the whole corpus

'''
# Pairwise comparison of the first 10 documents.
count = 0
texts_lsi = []
sims = []
while count < 10:
    compare_text = dictionary.doc2bow(texts_stemmed[count])
    texts_lsi.append(lsi[compare_text])
    sims.append(index[texts_lsi[count]])  # similarity query against the corpus
    count += 1
'''

# Assemble the similarity matrix with numpy and save it.
import numpy as np
sim_matrix = np.array(sims).transpose()
np.savetxt('***.txt', sim_matrix, delimiter=',', fmt='%10.8f')
# Read the data back:
#t = np.loadtxt(r"C:\Users\kelvin-li\***.txt", delimiter=',')
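The script saves the full query-by-corpus similarity matrix to disk. If instead only the best matches for each labeled document are needed, the scores returned by `index` can be ranked directly. A minimal sketch under that assumption, reusing the `dictionary`, `lsi`, `index`, and `texts_stemmed` objects built above (the helper name `most_similar` and the cutoff `top_n=10` are illustrative choices, not part of the original project):

# Rank corpus documents by LSI cosine similarity for one query document.
# Reuses dictionary, lsi, index, and texts_stemmed from the script above;
# most_similar and top_n=10 are hypothetical additions for illustration.
def most_similar(query_idx, top_n=10):
    query_bow = dictionary.doc2bow(texts_stemmed[query_idx])
    scores = index[lsi[query_bow]]  # one cosine score per corpus document
    ranked = sorted(enumerate(scores), key=lambda pair: -pair[1])
    return ranked[:top_n]           # [(doc_id, similarity), ...]

# Example: the ten nearest neighbours of document 0.
for doc_id, score in most_similar(0):
    print("%d\t%.8f" % (doc_id, score))

Because the query documents are themselves part of the indexed corpus, each query should return itself first with a score near 1.0, which is a quick sanity check on the index.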