文本情感分類問題
- 機器學習方法 TFIDF+機器學習分類算法
- 深度學習方法 TextCNN TextRNN 預訓練的模型
預訓練的模型有哪些?
- bert
輸入有三個序列 Token(字符的序列 把文本轉化成字符的編碼 進行輸入)
Segment(段序列 用於區分是句子A 還是句子B (如果是A就設爲0 B就設爲1) 用於文本分類 可以全部設成0)
Position(位置向量 由於transformer不能很好的捕捉位置特徵 引入位置向量 隨機初始化 構建embedding的過程)
- albert
- xlnet
- roberta
預訓練的模型 需要很大的顯存
Bert源碼
- https://github.com/google-research/bert
- https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/modeling.py#L428
transformer包
- https://huggingface.co/transformers/v2.5.0/model_doc/bert.html
- tokenizer.encode_plus 參數詳細見:https://github.com/huggingface/transformers/blob/72768b6b9c2083d9f2d075d80ef199a3eae881d8/src/transformers/tokenization_utils.py#L924 924行
# Standard library
import os

# Third party
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
# Public Keras API path; `tensorflow.python.*` is a private namespace and
# its layout changes between TF releases.
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
# NOTE(review): star import kept deliberately — later cells rely on
# BertTokenizer, BertConfig and TFBertModel coming from here.
from transformers import *

print(tf.__version__)
# --- Paths, constants, and raw data loading ---
TRAIN_PATH = './data/train_dataset/'
TEST_PATH = './data/test_dataset/'
BERT_PATH = './bert_base_chinese/'
# Fixed sequence length every text is padded/truncated to (Weibo posts are short).
MAX_SEQUENCE_LENGTH = 140
# Column names in the CSVs: the Weibo text content and the sentiment label.
input_categories = '微博中文內容'
output_categories = '情感傾向'
df_train = pd.read_csv(TRAIN_PATH+'nCoV_100k_train.labled.csv',engine ='python')
# Keep only rows with a valid sentiment label (-1 / 0 / 1); drop noise labels.
df_train = df_train[df_train[output_categories].isin(['-1','0','1'])]
df_test = pd.read_csv(TEST_PATH+'nCov_10k_test.csv',engine ='python')
df_sub = pd.read_csv(TEST_PATH+'submit_example.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)
# Tokenizer built from the local Chinese BERT vocab file.
tokenizer = BertTokenizer.from_pretrained(BERT_PATH+'bert-base-chinese-vocab.txt')
# Demo call: encode a short string to inspect the three sequences BERT consumes.
tokenizer.encode_plus("深度之眼",
add_special_tokens=True,
max_length=20,
truncation_strategy= 'longest_first')
# Example output:
# {'input_ids': [101, 3918, 2428, 722, 4706, 102],  # token ids ([CLS] ... [SEP])
#  'token_type_ids': [0, 0, 0, 0, 0, 0],            # segment ids (sentence A/B)
#  'attention_mask': [1, 1, 1, 1, 1, 1]}            # attention mask (1 = real token);
# note this is NOT a position sequence — position embeddings are created
# inside BERT's embedding layer, not passed in here.
將微博的文本轉化成三個序列進行輸入
max_sequence_length 設置的固定的文本長度(取140)
def _convert_to_transformer_inputs(instance, tokenizer, max_sequence_length):
"""Converts tokenized input to ids, masks and segments for transformer (including bert)"""
"""默認返回input_ids,token_type_ids,attention_mask"""
# 使用tokenizer接口 將文本進行編碼 生成一個字典 字典中包含三個元素
# instance 文本
inputs = tokenizer.encode_plus(instance,
add_special_tokens=True,
max_length=max_sequence_length,
truncation_strategy='longest_first')
# 將編碼後的內容取出來
input_ids = inputs["input_ids"]
input_masks = inputs["attention_mask"]
input_segments = inputs["token_type_ids"]
padding_length = max_sequence_length - len(input_ids)
# 填充
padding_id = tokenizer.pad_token_id
input_ids = input_ids + ([padding_id] * padding_length)
input_masks = input_masks + ([0] * padding_length)
input_segments = input_segments + ([0] * padding_length)
return [input_ids, input_masks, input_segments]
def compute_input_arrays(df, columns, tokenizer, max_sequence_length):
    """Encode every row of ``df[columns]`` into stacked model inputs.

    Returns three int32 arrays, each of shape (n_rows, max_sequence_length):
    token ids, attention masks, and segment ids.
    """
    all_ids, all_masks, all_segments = [], [], []
    for text in tqdm(df[columns]):
        # str() guards against NaN / non-string cells in the dataframe.
        ids, masks, segments = _convert_to_transformer_inputs(
            str(text), tokenizer, max_sequence_length)
        all_ids.append(ids)
        all_masks.append(masks)
        all_segments.append(segments)
    return [np.asarray(all_ids, dtype=np.int32),
            np.asarray(all_masks, dtype=np.int32),
            np.asarray(all_segments, dtype=np.int32)]
# Build the three input sequences for both the training and the test set.
inputs = compute_input_arrays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arrays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
def compute_output_arrays(df, columns):
    """Map sentiment labels {-1, 0, 1} to class indices {0, 1, 2}.

    Shifting by +1 makes the labels zero-based so they can be one-hot
    encoded with ``to_categorical``.
    """
    shifted = df[columns].astype(int) + 1
    return np.asarray(shifted)
# Shifted training labels (0/1/2), one-hot encoded later per CV fold.
outputs = compute_output_arrays(df_train, output_categories)
BERT模型
構建BERT模型
def create_model():
    """Build the Keras BERT classifier: three int32 sequence inputs -> 3-way softmax."""
    # One input per BERT sequence, each padded to MAX_SEQUENCE_LENGTH.
    ids_in = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    mask_in = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    seg_in = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    # Load pretrained Chinese BERT from local files; output_hidden_states=True
    # exposes every transformer layer's activations, not just the last.
    config = BertConfig.from_pretrained(BERT_PATH + 'bert-base-chinese-config.json', output_hidden_states=True)
    bert_model = TFBertModel.from_pretrained(BERT_PATH + 'bert-base-chinese-tf_model.h5', config=config)

    # BERT returns a triple:
    #   sequence_output: (batch, MAX_SEQUENCE_LENGTH, 768) per-token vectors
    #   pooler_output:   (batch, 768) pooled representation
    #   hidden_states:   activations of every transformer layer
    # NOTE(review): the third model input carries segment ids (token_type_ids),
    # even though downstream wiring treats it as "the third sequence".
    sequence_output, pooler_output, hidden_states = bert_model(ids_in, attention_mask=mask_in,
                                                               token_type_ids=seg_in)

    # Average the per-token vectors, regularise, then classify into 3 classes.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
    dropped = tf.keras.layers.Dropout(0.15)(pooled)
    probs = tf.keras.layers.Dense(3, activation='softmax')(dropped)

    model = tf.keras.models.Model(inputs=[ids_in, mask_in, seg_in], outputs=probs)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # Cross-entropy for the 3-class one-hot targets; track accuracy and MAE.
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc', 'mae'])
    return model
模型訓練
CV交叉驗證
# --- Training: 5-fold stratified cross-validation ---
# Stratify on the label so each fold keeps the class balance.
gkf = StratifiedKFold(n_splits=5).split(X=df_train[input_categories].fillna('-1'), y=df_train[output_categories].fillna('-1'))
valid_preds = []
test_preds = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    # Slice each of the three input arrays with this fold's row indices.
    train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
    train_outputs = to_categorical(outputs[train_idx])
    valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
    valid_outputs = to_categorical(outputs[valid_idx])
    # Clear Keras graph state from the previous fold before building anew.
    K.clear_session()
    model = create_model()
    model.fit(train_inputs, train_outputs, validation_data= [valid_inputs, valid_outputs], epochs=2, batch_size=32)
    # model.save_weights(f'bert-{fold}.h5')
    valid_preds.append(model.predict(valid_inputs))
    # Each fold predicts the full test set; the 5 predictions are averaged below.
    test_preds.append(model.predict(test_inputs))
df_test.head()
# Average the per-fold class probabilities across the 5 CV models, then
# take the most probable class per row.
sub = np.average(test_preds, axis=0)
sub = np.argmax(sub, axis=1)
# Build the submission frame from the test-set Weibo ids. The .copy() is
# the fix: df_test[['微博id']] is a slice, and assigning a new column into
# it directly raises SettingWithCopyWarning and may silently not write.
df_sub = df_test[['微博id']].copy()
df_sub.head()
# Labels were shifted by +1 for training, so shift back to {-1, 0, 1}.
df_sub['y'] = sub - 1
df_sub.columns = ['id', 'y']
df_sub.head()
df_sub.to_csv('test_sub.csv', index=False, encoding='utf-8')