from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split
def spamTest_skl(data_dir='email/', test_size=0.2, random_state=None):
    """Train and evaluate a bag-of-words Naive Bayes spam filter.

    Loads the labelled documents under ``data_dir`` with
    ``datasets.load_files``, holds out a random ``test_size`` fraction,
    fits a ``MultinomialNB`` on the word counts of the remaining documents
    and prints the predictions together with a classification report for
    the held-out documents.

    Args:
        data_dir: Root folder passed to ``datasets.load_files``.
        test_size: Fraction of the documents held out for evaluation.
        random_state: Seed forwarded to ``train_test_split``. ``None``
            keeps the original behaviour of a new random split per run.

    Returns:
        None. All results are written to stdout.
    """
    # Load the documents stored under data_dir.
    base_data = datasets.load_files(data_dir)
    # NOTE(review): len() of the returned container does not count the
    # documents (the recorded run prints 5 here versus 50 labels below);
    # kept unchanged so the printed output stays the same.
    print(len(base_data))
    print(len(base_data.target))
    print(base_data.target)

    # Random hold-out split; the training share is the complement of
    # test_size, so it does not need to be passed separately.
    train_data, test_data, train_y, test_y = train_test_split(
        base_data.data, base_data.target,
        test_size=test_size, random_state=random_state)

    # Build the term-count matrix; the vocabulary is learned from the
    # training documents only.
    vectorizer = CountVectorizer(stop_words='english', decode_error='ignore')
    train_counts = vectorizer.fit_transform(train_data)

    # Train the classifier.
    clf = MultinomialNB().fit(train_counts, train_y)

    # Predict the held-out documents. The count matrix is passed as
    # produced by the vectorizer; converting it to a dense array first
    # only costs memory and does not change the predictions.
    test_counts = vectorizer.transform(test_data)
    predicted = clf.predict(test_counts)
    print(predicted)

    # Evaluate on the test set. Passing labels explicitly keeps the report
    # aligned with target_names even if the random split leaves one class
    # out of the test documents.
    class_ids = list(range(len(base_data.target_names)))
    print(classification_report(test_y, predicted, labels=class_ids,
                                target_names=base_data.target_names))
# Run the demo immediately; there is no __main__ guard, so this also fires on import.
spamTest_skl()
輸出:
5
50
[1 0 0 1 0 1 1 1 0 0 1 1 1 1 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 1
0 0 0 1 0 0 0 1 1 0 0 1 1]
[0 1 1 1 1 0 1 0 0 0]
             precision    recall  f1-score   support

     noSpam       1.00      1.00      1.00         5
       spam       1.00      1.00      1.00         5

avg / total       1.00      1.00      1.00        10