Stacking in TensorFlow 2.0: a RoBERTa Ensemble

1. Introduction

This post uses stacking to improve extraction performance on tweet sentiment data. The stacking code is available here:
https://github.com/llq20133100095/tweet_sentiment_extraction/blob/other_mission2/thinking/ensamble/roberta-adversarial-dropout_0.715_en.ipynb

The background is the Kaggle competition Tweet Sentiment Extraction.

2. Method

2.1 Stacking

First, a brief introduction to stacking. It is a fairly common ensemble learning method, built from several layers of models. In the code here I use a two-layer structure:

  • First layer: six models with similar results; their outputs are summed and averaged, and the averaged result is fed into the second layer (a toy sketch of this fusion follows below);
  • Second layer: a softmax classifier that directly produces the final predictions.

(Figure: the two-layer stacking architecture)
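As a toy illustration of the first-layer fusion (a hypothetical snippet; the real models output per-token start/end probabilities rather than random values):

import numpy as np

# six models each produce per-token probabilities for a batch;
# their element-wise mean is the feature handed to the second layer
model_probs = [np.random.rand(8, 128) for _ in range(6)]  # batch of 8, sequence length 128
fused = np.mean(model_probs, axis=0)                      # shape (8, 128)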
In the experiments, the training data is first split 5-fold: the original training data is divided into 5 parts, with 1 part serving as the validation data and the remaining 4 parts used to train each model. After all 5 folds have been run, we obtain new features for all 5 validation splits, and these new features become the input features for the second-layer model.
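The code below assumes a data_df_5folds frame with a kfold column. A minimal sketch of how such a column could be assigned (train_df is a placeholder for the raw training dataframe; the notebook may use a different or stratified split):

from sklearn.model_selection import KFold

data_df_5folds = train_df.copy().reset_index(drop=True)  # train_df: hypothetical raw training dataframe
data_df_5folds["kfold"] = -1
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (_, valid_idx) in enumerate(kf.split(data_df_5folds)):
    data_df_5folds.loc[valid_idx, "kfold"] = fold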

The second-layer model does not need to be complicated; simple models are sufficient (see the sketch below).
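The StackingDnn class used later is not included in this excerpt. A minimal sketch of what such a simple second-layer model could look like, assuming it maps the fused per-token features to start/end softmax distributions (the notebook's actual architecture may differ):

import tensorflow as tf

class StackingDnn(tf.keras.Model):
    # deliberately small: one hidden layer, then softmax heads over token positions
    def __init__(self, hidden_size=64):
        super().__init__()
        self.dense = tf.keras.layers.Dense(hidden_size, activation="relu")
        self.start_head = tf.keras.layers.Dense(1)
        self.end_head = tf.keras.layers.Dense(1)

    def call(self, inputs):
        # inputs: (batch, seq_len, features) fused first-layer probabilities
        h = self.dense(inputs)
        start_logits = tf.squeeze(self.start_head(h), axis=-1)  # (batch, seq_len)
        end_logits = tf.squeeze(self.end_head(h), axis=-1)
        return tf.nn.softmax(start_logits, axis=-1), tf.nn.softmax(end_logits, axis=-1)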

2.2 Code

  • Run the six first-layer models to obtain the fused features:
def run_first_stack(fold, dataframe_stack_train1, dataframe_stack_train2):
    # split the 5-fold data: 4 folds for training, the held-out fold for validation
    df_train_fold = data_df_5folds[data_df_5folds.kfold != fold].reset_index(drop=True)
    df_valid_fold = data_df_5folds[data_df_5folds.kfold == fold].reset_index(drop=True)
    
    num_train_batches = len(df_train_fold) // batch_size + int(len(df_train_fold) % batch_size != 0)
    num_eval_batches = len(df_valid_fold) // batch_size + int(len(df_valid_fold) % batch_size != 0)
    num_test_batches = len(test_df) // batch_size + int(len(test_df) % batch_size != 0)

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    # optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
    #     optimizer, 'dynamic')

    # load the shared RoBERTa config; each model below restores its saved per-fold weights
    config = RobertaConfig.from_json_file(os.path.join(PATH, "config-roberta-base.json"))
    config.output_hidden_states = True
    config.num_labels = 2
    model1 = RoBertQAModel1.from_pretrained(os.path.join(PATH, "pretrained-roberta-base.h5"), config=config)
    model1.load_weights(f'../input/roberta-dropout02-adversarial/fold-{fold}.h5')
    
    model2 = RoBertQAModel2.from_pretrained(os.path.join(PATH, "pretrained-roberta-base.h5"), config=config)
    model2.load_weights(f'../input/tf-roberta-base/fold-{fold}.h5')
    
    model3 = RoBertQAModel3.from_pretrained(os.path.join(PATH, "pretrained-roberta-base.h5"), config=config)
    model3.load_weights(f'../input/tf-adversarial-training2/fold-{fold}.h5')
    
    model4 = RoBertQAModel4.from_pretrained(os.path.join(PATH, "pretrained-roberta-base.h5"), config=config)
    model4.load_weights(f'../input/tf-roberta-base-768/fold-{fold}.h5')
    
    model5 = RoBertQAModel5.from_pretrained(os.path.join(PATH, "pretrained-roberta-base.h5"), config=config)
    model5.load_weights(f'../input/tf-roberta-base-three/fold-{fold}.h5')
    
    model6 = RoBertQAModel1.from_pretrained(os.path.join(PATH, "pretrained-roberta-base.h5"), config=config)
    model6.load_weights(f'../input/tf-roberta-base-715/fold-{fold}.h5')
    
    # model7 = RoBertQAModel2.from_pretrained(os.path.join(PATH, "pretrained-roberta-base.h5"), config=config)
    # model7.load_weights(f'../input/tf-roberta-base-01/fold-{fold}.h5')
    
    loss_fn = focal_loss

    loss_step = []
    global_step = tf.Variable(0, name="global_step")
    # train_dataset = TweetSentimentDataset.create(
    #     df_train_fold, batch_size, shuffle_buffer_size=2048)
    valid_dataset = TweetSentimentDataset.create(
        df_valid_fold, batch_size, shuffle_buffer_size=-1)
    test_dataset = TweetSentimentDataset.create(
        test_df, batch_size, shuffle_buffer_size=-1)

    # run every first-layer model on the validation fold
    # (predict1 also returns the targets and offsets; predict2 returns fewer values)
    pred_start1, pred_end1, text, selected_text, sentiment, offset, vaild_target_start, vaild_target_end = \
        predict1(model1, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)
    pred_start2, pred_end2, _, _, _, _ = \
        predict2(model2, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)
    pred_start3, pred_end3, _, _, _, _, _, _ = \
        predict1(model3, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)
    pred_start4, pred_end4, _, _, _, _ = \
        predict2(model4, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)
    pred_start5, pred_end5, _, _, _, _ = \
        predict2(model5, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)
    pred_start6, pred_end6, _, _, _, _, _, _ = \
        predict1(model6, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)
    # pred_start7, pred_end7, _, _, _, _ = \
    #     predict2(model7, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)
    
    # element-wise sum, then average the six models' start/end distributions
    pred_start1 += pred_start2 + pred_start3 + pred_start4 + pred_start5 + pred_start6
    pred_end1 += pred_end2 + pred_end3 + pred_end4 + pred_end5 + pred_end6
    pred_start1 = pred_start1 / 6
    pred_end1 = pred_end1 / 6
    
    # accumulate the averaged validation features across folds into the second-layer training set
    if dataframe_stack_train1 is None:
        dataframe_stack_train1 = dataframe_stack_generate(pred_start1, pred_end1, text, selected_text, sentiment, offset, vaild_target_start, vaild_target_end, fold)
        dataframe_stack_train2 = dataframe_stack_generate(pred_start2, pred_end2, text, selected_text, sentiment, offset, vaild_target_start, vaild_target_end, fold)
    else:
        _dataframe_stack_train1 = dataframe_stack_generate(pred_start1, pred_end1, text, selected_text, sentiment, offset, vaild_target_start, vaild_target_end, fold)
        _dataframe_stack_train2 = dataframe_stack_generate(pred_start2, pred_end2, text, selected_text, sentiment, offset, vaild_target_start, vaild_target_end, fold)
        dataframe_stack_train1 = pd.concat([dataframe_stack_train1, _dataframe_stack_train1], axis=0)
        dataframe_stack_train2 = pd.concat([dataframe_stack_train2, _dataframe_stack_train2], axis=0)
        
    # run the same six models on the test set and average in the same way
    test_pred_start1, test_pred_end1, test_text, _, test_sentiment, test_offset, test_target_start, test_target_end = \
            predict1(model1, test_dataset, loss_fn, optimizer, num_test_batches, fold)
    test_pred_start2, test_pred_end2, _, _, _, _ = \
            predict2(model2, test_dataset, loss_fn, optimizer, num_test_batches, fold)
    test_pred_start3, test_pred_end3, _, _, _, _, _, _ = \
            predict1(model3, test_dataset, loss_fn, optimizer, num_test_batches, fold)
    test_pred_start4, test_pred_end4, _, _, _, _ = \
            predict2(model4, test_dataset, loss_fn, optimizer, num_test_batches, fold)
    test_pred_start5, test_pred_end5, _, _, _, _ = \
            predict2(model5, test_dataset, loss_fn, optimizer, num_test_batches, fold)
    test_pred_start6, test_pred_end6, _, _, _, _, _, _ = \
            predict1(model6, test_dataset, loss_fn, optimizer, num_test_batches, fold)
    # test_pred_start7, test_pred_end7, _, _, _, _ = \
    #         predict2(model7, test_dataset, loss_fn, optimizer, num_test_batches, fold)

    test_pred_start1 += test_pred_start2 + test_pred_start3 + test_pred_start4 + test_pred_start5 + test_pred_start6
    test_pred_end1 += test_pred_end2 + test_pred_end3 + test_pred_end4 + test_pred_end5 + test_pred_end6
    test_pred_start1 = test_pred_start1 / 6
    test_pred_end1 = test_pred_end1 / 6
    
    return dataframe_stack_train1, dataframe_stack_train2, test_pred_start1, test_pred_end1, test_text, test_sentiment, test_offset, test_target_start, test_target_end

The code above yields the averaged output features on the validation set, and likewise the averaged output features on the test set.
No retraining happens here; the previously saved models are simply restored with load_weights.
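The focal_loss referenced above is also not shown in this excerpt. A minimal sketch, assuming one-hot start/end position targets and probability outputs over token positions:

import tensorflow as tf

def focal_loss(y_true, y_pred, gamma=2.0, epsilon=1e-7):
    # y_true: one-hot position targets; y_pred: predicted probabilities over positions
    y_pred = tf.clip_by_value(y_pred, epsilon, 1.0 - epsilon)
    # the (1 - p)^gamma factor down-weights easy, already-confident positions
    loss = -y_true * tf.pow(1.0 - y_pred, gamma) * tf.math.log(y_pred)
    return tf.reduce_mean(tf.reduce_sum(loss, axis=-1))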

  • The second-layer model:
def run_second_stack(test_preds_start, test_preds_end, test_text, test_sentiment, test_offset, test_target_start, test_target_end):
    # (dataframe_stack_train1, num_folds, num_epochs and best_score_list are globals in the notebook)
    # the test set has no selected_text, so the raw text is passed in its place
    dataframe_stack_test = dataframe_stack_generate(test_preds_start, test_preds_end, test_text, test_text, test_sentiment, test_offset, test_target_start, test_target_end, 0)
    
    # initialize second test predictions
    test_preds_start = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)
    test_preds_end = np.zeros((len(test_df), MAX_SEQUENCE_LENGTH), dtype=np.float32)
    
    # second-layer training: run 5-fold again over the stacked features
    for fold in range(num_folds):
        dataframe_stack_data1_train = dataframe_stack_train1[dataframe_stack_train1.fold != fold].reset_index(drop=True)
        # dataframe_stack_data2_train = dataframe_stack_train2[dataframe_stack_train2.fold != fold].reset_index(drop=True)
        dataframe_stack_data1_vaild = dataframe_stack_train1[dataframe_stack_train1.fold == fold].reset_index(drop=True)
        
        num_train_batches = len(dataframe_stack_data1_train) // batch_size + int(len(dataframe_stack_data1_train) % batch_size != 0)
        num_eval_batches = len(dataframe_stack_data1_vaild) // batch_size + int(len(dataframe_stack_data1_vaild) % batch_size != 0)
        num_test_batches = len(dataframe_stack_test) // batch_size + int(len(dataframe_stack_test) % batch_size != 0)
        
        optimizer = tf.keras.optimizers.Adam(learning_rate)
        loss_fn = focal_loss
        
        # model 
        model = StackingDnn()
        
        loss_step = []
        global_step = tf.Variable(0, name="global_step")
        
        train_dataset1 = StackingDataset.create(
            dataframe_stack_data1_train, batch_size, shuffle_buffer_size=2048)
        # train_dataset2 = StackingDataset.create(
        #     dataframe_stack_data2_train, batch_size, shuffle_buffer_size=2048)
        valid_dataset = StackingDataset.create(
            dataframe_stack_data1_vaild, batch_size, shuffle_buffer_size=-1)
        test_dataset = StackingDataset.create(
            dataframe_stack_test, batch_size, shuffle_buffer_size=-1)
        
        best_score = float('-inf')
        for epoch_num in range(num_epochs):
            # train for an epoch
            stacking_train(model, train_dataset1, loss_fn, optimizer, global_step, loss_step, num_train_batches, fold)
            # stacking_train(model, train_dataset2, loss_fn, optimizer, global_step, loss_step, num_train_batches, fold)
    
            # predict validation set and compute Jaccard scores
            pred_start, pred_end, text, selected_text, sentiment, offset = \
                stacking_predict(model, valid_dataset, loss_fn, optimizer, num_eval_batches, fold)

            selected_text_pred = decode_prediction(pred_start, pred_end, text, offset, sentiment, is_testing=False)
            jaccards = []
            for i in range(len(selected_text)):
                jaccards.append(
                    jaccard(selected_text[i], selected_text_pred[i]))
    
            score = np.mean(jaccards)
    
            if epoch_num + 1 == num_epochs:
                plt.plot(list(range(global_step.numpy())), loss_step)
                plt.show()
            print("fold = %d , epoch = %d , jaccard = %f" % (fold, epoch_num+1, score))
    
            if score > best_score:
                best_score = score
                model.save_weights(f'fold-{fold}.h5')
    
                # predict test set
                test_pred_start, test_pred_end, test_text, _, test_sentiment, test_offset = \
                    stacking_predict(model, test_dataset, loss_fn, optimizer, num_test_batches, fold)
            
        best_score_list.append(best_score)
        # add this fold's best test preds to the running sums (averaged over folds afterwards)
        test_preds_start += test_pred_start
        test_preds_end += test_pred_end

        # reset model, as well as session and graph (to avoid OOM issues?) 
        session = tf.compat.v1.get_default_session()
        graph = tf.compat.v1.get_default_graph()
        del session, graph, model
        model = StackingDnn()
    return (test_preds_start, test_preds_end, test_text, test_sentiment, test_offset)
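The returned test predictions are sums over the fold models, so a final averaging and decoding step is still needed. A sketch of that step, assuming the notebook's decode_prediction helper is reused with is_testing=True:

# average the per-fold sums, then decode spans from the start/end distributions
test_preds_start /= num_folds
test_preds_end /= num_folds
selected_text_pred = decode_prediction(
    test_preds_start, test_preds_end, test_text, test_offset, test_sentiment,
    is_testing=True)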
