Text Classification (2): Text Classification Based on Traditional Machine Learning Methods

Text classification with traditional machine learning usually extracts TF-IDF or bag-of-words features and feeds them to a classifier for training. There are many classical models to choose from, such as logistic regression, support vector machines, multi-layer perceptrons, and naive Bayes. The basic workflow is: obtain the data, preprocess it (covered in the previous post: https://blog.csdn.net/weixin_44766179/article/details/89855100), extract features, train a model, and predict.
Below, this workflow is applied to a spam classification task.

import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
# First run may require: import nltk; nltk.download('stopwords'); nltk.download('punkt')
import warnings

warnings.filterwarnings('ignore')

data_file = './spam.csv'  # SMS spam dataset: column v1 is the label (ham/spam), v2 is the message text

df = pd.read_csv(data_file, encoding='latin1')

labels = df.v1
texts = df.v2

def clear_data(text):
    # Expand common English contractions
    text_abbreviation = []
    for item in text:
        item = item.lower().replace("it's", "it is").replace("i'm", "i am").replace("he's", "he is").replace("she's", "she is")\
        .replace("we're", "we are").replace("they're", "they are").replace("you're", "you are").replace("that's", "that is")\
        .replace("this's", "this is").replace("can't", "can not").replace("don't", "do not").replace("doesn't", "does not")\
        .replace("we've", "we have").replace("i've", " i have").replace("isn't", "is not").replace("won't", "will not")\
        .replace("hasn't", "has not").replace("wasn't", "was not").replace("weren't", "were not").replace("let's", "let us")
        
        text_abbreviation.append(item)
        
    # Remove punctuation, digits, and every other non-letter character
    text_clear_str = []
    for item in text_abbreviation:
        item = re.sub("[^a-zA-Z]", " ", item)
        text_clear_str.append(' '.join(item.split()))
        
    texts = []
    stem_porter = PorterStemmer()  # Porter stemmer for word-form normalization
    stop_words = set(stopwords.words("english"))  # English stop words (a set makes lookups fast)

    # Tokenize, stem, and remove stop words
    for item in text_clear_str:
        words_token = word_tokenize(item)  # tokenize
        words = [stem_porter.stem(w) for w in words_token if w not in stop_words]
        texts.append(' '.join(words))
        
    return texts
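
The chained replace() calls above work, but the list is easy to get wrong as it grows. A more maintainable equivalent is a dictionary plus one compiled regex; this is just a sketch using a few of the same contractions (the full table would carry over unchanged):

ABBREVIATIONS = {
    "it's": "it is", "i'm": "i am", "can't": "can not", "don't": "do not",
    # ... remaining pairs from the replace() chain above
}
abbrev_pattern = re.compile('|'.join(re.escape(k) for k in ABBREVIATIONS))

def expand_abbreviations(text):
    # Replace every known contraction in a single pass over the string
    return abbrev_pattern.sub(lambda m: ABBREVIATIONS[m.group(0)], text.lower())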

texts = clear_data(texts)

le = LabelEncoder()
labels = le.fit_transform(labels)
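
For this dataset the two classes are 'ham' and 'spam'; LabelEncoder assigns integer codes in alphabetical order of the class names, so 'ham' becomes 0 and 'spam' becomes 1. You can confirm the mapping with:

print(le.classes_)  # ['ham' 'spam'] -> encoded as 0 and 1
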
# TF-IDF feature extraction
def features_extraction(text):
    vector = TfidfVectorizer()
    # toarray() yields a plain ndarray; todense() returns the deprecated np.matrix type
    return vector.fit_transform(text).toarray()

features = features_extraction(texts)
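
To see what the vectorizer produces, here is a tiny made-up corpus (two documents, five distinct terms); each row of the resulting matrix is one document and each column one term:

toy = TfidfVectorizer()
toy_matrix = toy.fit_transform(["free prize now", "see you now"])
print(toy.get_feature_names_out())  # ['free' 'now' 'prize' 'see' 'you']  (get_feature_names() on scikit-learn < 1.0)
print(toy_matrix.shape)             # (2, 5)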

x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=2)
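
One caveat: the vectorizer above is fitted on the whole corpus before the split, so term statistics from the test set leak into the training features. A leakage-free sketch using scikit-learn's Pipeline (the variable names here are mine, not part of the code above):

from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB

raw_train, raw_test, y_tr, y_te = train_test_split(texts, labels, test_size=0.2, random_state=2)
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipe.fit(raw_train, y_tr)  # TF-IDF is now fitted on the training texts only
print('accuracy_pipeline:', pipe.score(raw_test, y_te))
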
# Logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(x_train, y_train)

y_pred = lr.predict(x_test)

print('accuracy_lr : ', accuracy_score(y_test, y_pred))  # Output: 0.9524663677130045
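
Because spam datasets are usually heavily imbalanced (far more ham than spam), accuracy alone can be misleading; per-class precision and recall give a clearer picture:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=le.classes_))
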
# Support vector machine
from sklearn.svm import SVC

svc = SVC(kernel='linear').fit(x_train, y_train)

y_pred = svc.predict(x_test)

print('accuracy_svm: ', accuracy_score(y_test, y_pred))  # Output: 0.9739910313901345
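
Since the kernel is linear, LinearSVC is usually much faster than SVC on high-dimensional TF-IDF features and should give similar results; a drop-in sketch:

from sklearn.svm import LinearSVC

svc_linear = LinearSVC().fit(x_train, y_train)
print('accuracy_linearsvc:', accuracy_score(y_test, svc_linear.predict(x_test)))
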
# Multi-layer perceptron
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 100)).fit(x_train, y_train)

y_pred = mlp.predict(x_test)

print('accuracy_mlp: ', accuracy_score(y_test, y_pred))  # Output: 0.9748878923766816
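
By default MLPClassifier trains for at most 200 iterations and may warn that it has not converged; raising max_iter or turning on early stopping is the usual remedy. The settings below are illustrative, not tuned:

mlp2 = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=500, early_stopping=True, random_state=2)
mlp2.fit(x_train, y_train)
print('accuracy_mlp2:', accuracy_score(y_test, mlp2.predict(x_test)))
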
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

mb = MultinomialNB().fit(x_train, y_train)

y_pred = mb.predict(x_test)

print('accuracy_mb: ', accuracy_score(y_test, y_pred))  # Output: 0.9623318385650225
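
Finally, to classify a brand-new message you must reuse the same cleaning step and the already-fitted vectorizer; calling fit_transform on the new text would build a different vocabulary. A sketch, assuming features_extraction is changed to also return its fitted TfidfVectorizer (called tfidf below; the sample message is made up):

new_raw = ["Congratulations! You have won a free prize, reply WIN to claim"]
new_x = tfidf.transform(clear_data(new_raw)).toarray()  # transform only: no refitting
print(le.inverse_transform(mb.predict(new_x)))          # e.g. ['spam']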