Jupyter--sentiment_bert_tf2

This is a modification of https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb using the TensorFlow 2.0 Keras implementation of BERT from kpe/bert-for-tf2, together with the original google-research/bert weights.

Predicting Movie Review Sentiment with kpe/bert-for-tf2

First, import some standard prerequisites:

import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf

tf.__version__

In addition to the standard libraries we imported above, we’ll need to install the bert-for-tf2 Python package and do the imports required for loading the pre-trained weights and tokenizing the input text.
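
The package is published on PyPI, so a one-off notebook cell such as the following (adjust it to your environment, e.g. add --user or install into a virtualenv) takes care of the install:

!pip install bert-for-tf2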

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
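
As a rough sketch of what these loader utilities are for (the checkpoint paths below are hypothetical placeholders; this is only meant to illustrate the utilities, not to replace the notebook’s model-building code): StockBertConfig reads a stock bert_config.json, map_stock_config_to_params converts it into BertModelLayer parameters, and load_stock_weights copies the original checkpoint weights into the built Keras layer.

bert_model_dir = ".model/uncased_L-12_H-768_A-12"          # hypothetical checkpoint location
bert_config_file = os.path.join(bert_model_dir, "bert_config.json")
bert_ckpt_file = os.path.join(bert_model_dir, "bert_model.ckpt")

with tf.io.gfile.GFile(bert_config_file, "r") as reader:
    stock_config = StockBertConfig.from_json_string(reader.read())
    bert_params = map_stock_config_to_params(stock_config)

l_bert = BertModelLayer.from_params(bert_params, name="bert")
# ... wrap l_bert in a keras.Model and build it, then copy the pre-trained weights:
# load_stock_weights(l_bert, bert_ckpt_file)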

# Data

First, let’s get the dataset, hosted by Stanford. The code below, which extracts and imports the IMDB Large Movie Review Dataset, is borrowed from this TensorFlow tutorial; the download step is commented out here and an already extracted local ./aclImdb copy is used instead.

from tensorflow import keras
import os
import re

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    data["sentiment"] = []
    for file_path in tqdm(os.listdir(directory), desc=os.path.basename(directory)):
        with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
            data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
#     dataset = tf.keras.utils.get_file(
#       fname="aclImdb.tar.gz", 
#       origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
#       extract=True)
    
    dataset = "./aclImdb"
    train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                       "aclImdb", "train"))
    test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                      "aclImdb", "test"))

    return train_df, test_df
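
As a quick sanity check, the loaders above can be called directly (illustrative only — it assumes the extracted aclImdb/ directory sits next to the notebook, as the hard-coded path expects). Each returned DataFrame holds 25,000 shuffled reviews with sentence, sentiment and polarity columns:

train_df, test_df = download_and_load_datasets()
print(train_df.shape, test_df.shape)
train_df.head()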

Let’s use the MovieReviewData class below to prepare/encode the data for feeding into our BERT model, by:

  • tokenizing the text
  • trimming or padding it to a max_seq_len length
  • appending the special tokens [CLS] and [SEP]
  • converting the string tokens to numerical IDs using the original model’s token encoding from `vocab.txt`

(A short, per-sentence sketch of these steps appears right after the class definition.)

from bert import bert_tokenization

class MovieReviewData:
    DATA_COLUMN = "sentence"
    LABEL_COLUMN = "polarity"

    def __init__(self, tokenizer: bert_tokenization.FullTokenizer, sample_size=None, max_seq_len=1024):
        self.tokenizer = tokenizer
        self.sample_size = sample_size
        self.max_seq_len = 0
        train, test = download_and_load_datasets()

        train, test = map(lambda df: df.reindex(df[MovieReviewData.DATA_COLUMN].str.len().sort_values().index),
                          [train, test])

        if sample_size is not None:
            assert sample_size % 128 == 0
            train, test = train.head(sample_size), test.head(sample_size)
            # train, test = map(lambda df: df.sample(sample_size), [train, test])

        ((self.train_x, self.train_y),
         (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        ((self.train_x, self.train_x_token_types),
         (self.test_x, self.test_x_token_types)) = map(self._pad,
                                                       [self.train_x, self.test_x])

    def _prepare(self, df):
        x, y = [], []
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                text, label = row[MovieReviewData.DATA_COLUMN], row[MovieReviewData.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(int(label))
                pbar.update()
        # dtype=object because the token id lists still have different lengths at this point
        return np.array(x, dtype=object), np.array(y)

    def _pad(self, ids):
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)
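
To make the four preparation steps above concrete, here is a small illustrative sketch (not part of the original notebook) that encodes a single sentence by hand and then builds the dataset object; the vocab.txt path is a hypothetical placeholder for wherever the pre-trained checkpoint was unpacked:

bert_model_dir = ".model/uncased_L-12_H-768_A-12"  # hypothetical checkpoint location
tokenizer = bert_tokenization.FullTokenizer(vocab_file=os.path.join(bert_model_dir, "vocab.txt"),
                                            do_lower_case=True)

# Per-sentence view of what MovieReviewData._prepare/_pad do:
tokens = ["[CLS]"] + tokenizer.tokenize("The movie was great!") + ["[SEP]"]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

# Encode the IMDB data; sample_size keeps the demo small and must be a multiple of 128.
data = MovieReviewData(tokenizer, sample_size=5 * 128, max_seq_len=128)
print("train_x", data.train_x.shape, "train_y", data.train_y.shape)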