傳統機器學習的文本分類通常提取TFIDF或者詞袋特徵,然後交給模型進行訓練。傳統機器學習的分類模型有很多,比如邏輯迴歸、支持向量機、多層感知機、貝葉斯等等。利用傳統機器學習方法進行文本分類的基本思路:獲取數據、數據預處理(上一篇博客已經講過了https://blog.csdn.net/weixin_44766179/article/details/89855100)、特徵提取、模型訓練、預測。
下面利用傳統機器學習方法實現垃圾郵件分類任務。
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Load the SMS spam dataset; latin1 tolerates the non-UTF-8 bytes in the file.
data_file = './spam.csv'
df = pd.read_csv(data_file, encoding='latin1')
# v1 holds the class label, v2 the raw message text.
labels, texts = df['v1'], df['v2']
def clear_data(text):
    """Clean raw English messages for feature extraction.

    Pipeline per message: lowercase, expand common contractions, drop every
    non-letter character, tokenize, Porter-stem each token, and remove
    English stopwords.

    Parameters
    ----------
    text : iterable of str
        Raw message strings.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input message.
    """
    # Contraction -> expansion pairs (applied on lowercased text).
    contractions = (
        ("it's", "it is"), ("i'm", "i am"), ("he's", "he is"),
        ("she's", "she is"), ("we're", "we are"), ("they're", "they are"),
        ("you're", "you are"), ("that's", "that is"), ("this's", "this is"),
        ("can't", "can not"), ("don't", "do not"), ("doesn't", "does not"),
        ("we've", "we have"), ("i've", "i have"), ("isn't", "is not"),
        ("won't", "will not"), ("hasn't", "has not"), ("wasn't", "was not"),
        ("weren't", "were not"), ("let's", "let us"),
    )
    # Hoisted loop invariants: compile the regex once and build the stopword
    # set once.  The original looked tokens up in a *list* of stopwords,
    # an O(n) scan per token; a set makes each test O(1).
    non_letters = re.compile("[^a-zA-Z]")
    stem_porter = PorterStemmer()  # word-form normalisation
    stop_words = set(stopwords.words("english"))

    cleaned = []
    for item in text:
        item = item.lower()
        for short, full in contractions:
            item = item.replace(short, full)
        # Keep letters only, then collapse runs of whitespace.
        item = ' '.join(non_letters.sub(" ", item).split())
        # Tokenize, drop stopwords, stem what remains.
        words = [stem_porter.stem(w) for w in word_tokenize(item)
                 if w not in stop_words]
        cleaned.append(' '.join(words))
    return cleaned
# Clean the raw messages, then encode the string class labels
# (e.g. 'ham'/'spam') as integers for the sklearn estimators below.
texts = clear_data(texts)
le = LabelEncoder()
labels = le.fit_transform(labels)
# TF-IDF feature extraction
def features_extraction(text):
    """Turn cleaned texts into a dense TF-IDF feature matrix.

    Parameters
    ----------
    text : iterable of str
        Cleaned, space-joined documents.

    Returns
    -------
    numpy.ndarray of shape (n_documents, n_vocabulary)
        Dense TF-IDF weights.
    """
    vector = TfidfVectorizer()
    # .toarray() yields an ndarray; the original .todense() returned a
    # numpy.matrix, which is deprecated and rejected by newer
    # scikit-learn estimators (e.g. MultinomialNB input validation).
    return vector.fit_transform(text).toarray()
features = features_extraction(texts)
# Hold out 20% of the data for evaluation; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=2)

# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print('accuracy_lr : ', accuracy_score(y_test, y_pred))  # ~0.9524663677130045

# Support vector machine (linear kernel)
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
print('accuracy_svm: ', accuracy_score(y_test, y_pred))  # ~0.9739910313901345

# Multi-layer perceptron, two hidden layers of 100 units each
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 100,))
mlp.fit(x_train, y_train)
y_pred = mlp.predict(x_test)
print('accuracy_mlp: ', accuracy_score(y_test, y_pred))  # ~0.9748878923766816

# Multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB
mb = MultinomialNB()
mb.fit(x_train, y_train)
y_pred = mb.predict(x_test)
print('accuracy_mb: ', accuracy_score(y_test, y_pred))  # ~0.9623318385650225