使用CountVectorizer和TfidfVectorizer對fetch_20newsgroups數據進行分類,並對是否使用停用詞進行對比(精確度)

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

try:
    # train_test_split lives in model_selection since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the cross_validation module
    # (deprecated in 0.18, removed in 0.20).
    from sklearn.cross_validation import train_test_split


# 1. Classify 20newsgroups with CountVectorizer features, keeping stop words.

# Load the corpus with subset='all'.
news = fetch_20newsgroups(subset='all')

# Hold out 25% of the posts for evaluation; the fixed seed makes the split
# reproducible and is shared by every experiment below.
X_train, X_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)

# Default CountVectorizer (raw term counts). The vocabulary is learned from
# the training text only and then reused to encode the test text.
count_vec = CountVectorizer()
X_count_train = count_vec.fit_transform(X_train)
X_count_test = count_vec.transform(X_test)

# Multinomial naive Bayes with default settings, trained on the count features.
mnb_count = MultinomialNB()
mnb_count.fit(X_count_train, y_train)

# Report overall accuracy on the held-out posts.
print(
    'The accuracy of classifying 20newsgroup using Navie Bayes CountVectorizer without filtering stopwords:',
    mnb_count.score(X_count_test, y_test),
)

# Keep the predictions in y_count_predict and print the per-class report.
y_count_predict = mnb_count.predict(X_count_test)
print(
    classification_report(
        y_test,
        y_count_predict,
        target_names=news.target_names,
    )
)


# 2. Classify 20newsgroups with TfidfVectorizer features, keeping stop words.

# Default TfidfVectorizer: fit on the training text only, then reuse the
# fitted vectorizer to encode the test text.
tfidf_vec = TfidfVectorizer()
X_tfidf_train = tfidf_vec.fit_transform(X_train)
X_tfidf_test = tfidf_vec.transform(X_test)

# Multinomial naive Bayes with default settings, trained on the TF-IDF
# features. It gets its own name so the count-based model from experiment 1
# is not overwritten.
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_tfidf_train, y_train)

# The label names TfidfVectorizer so this line is distinguishable from the
# CountVectorizer result printed by experiment 1.
print(
    'The accuracy of classifying 20newsgroup using Naive Bayes TfidfVectorizer without filtering stopwords:',
    mnb_tfidf.score(X_tfidf_test, y_test),
)

# Keep the predictions in y_tfidf_predict and print the per-class report.
y_tfidf_predict = mnb_tfidf.predict(X_tfidf_test)
print(classification_report(y_test, y_tfidf_predict, target_names=news.target_names))


# 3. Repeat both experiments with English stop words removed, to compare
#    naive Bayes performance against the unfiltered runs above.
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfidf_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')

# Encode the training and test text with the stop-word-filtering CountVectorizer.
X_count_filter_train = count_filter_vec.fit_transform(X_train)
X_count_filter_test = count_filter_vec.transform(X_test)

# Encode the training and test text with the stop-word-filtering TfidfVectorizer.
X_tfidf_filter_train = tfidf_filter_vec.fit_transform(X_train)
X_tfidf_filter_test = tfidf_filter_vec.transform(X_test)

# Default multinomial naive Bayes on the filtered count features.
mnb_count_filter = MultinomialNB()
mnb_count_filter.fit(X_count_filter_train, y_train)
print(
    'The accuracy of classifying 20newsgroups using Naive Bayes(CountVectorizer by filter stopwords):',
    mnb_count_filter.score(X_count_filter_test, y_test),
)
y_count_filter_predict = mnb_count_filter.predict(X_count_filter_test)

# A second, independent default classifier on the filtered TF-IDF features.
mnb_tfidf_filter = MultinomialNB()
mnb_tfidf_filter.fit(X_tfidf_filter_train, y_train)
# Message corrected: the dataset is 20newsgroups and the model is Naive Bayes.
print(
    'The accuracy of classifying 20newsgroups using Naive Bayes(TfidfVectorizer by filtering stopwords):',
    mnb_tfidf_filter.score(X_tfidf_filter_test, y_test),
)
y_tfidf_filter_predict = mnb_tfidf_filter.predict(X_tfidf_filter_test)

# Per-class reports: count-based model first, then the TF-IDF model.
print(classification_report(y_test, y_count_filter_predict, target_names=news.target_names))
print(classification_report(y_test, y_tfidf_filter_predict, target_names=news.target_names))
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章