kaggle_tweet_w2v

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from gensim.models.word2vec import Word2Vec
import tqdm
import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from sklearn.model_selection import train_test_split
2.0.0

Log

1:lstm

2:gru

# Load the competition CSVs (train / test / sample submission) from one folder.
path_home = r"D:\pro\tianchi\kaggle_tweet"
data_train, data_test, data_submit = (
    pd.read_csv(os.path.join(path_home, csv_name), encoding="utf-8")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
import re
# Noise patterns stripped by cleanword(), applied in this order.
_CLEANWORD_NOISE_PATTERNS = (
    re.compile(r"http\S*"),       # URLs
    re.compile(r"@\S*"),          # @mentions
    re.compile(r"\d+"),           # runs of digits
    # U+0089 plus up to four following non-space characters.  This mirrors the
    # original `match[:5]` slice; presumably it targets mis-decoded punctuation
    # in the raw tweets -- TODO confirm against the data.
    re.compile(r"\x89\S{0,4}"),
)

# Every punctuation character that is turned into a word separator.
_CLEANWORD_PUNCT_TABLE = str.maketrans(
    {ch: " " for ch in "\n,?.[]!:-#|();=></"}
)


def cleanword(s):
    """Normalise one tweet into lowercase words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. replace URLs, @mentions, digit runs and U+0089-prefixed fragments
         with a space;
      3. replace the punctuation listed in ``_CLEANWORD_PUNCT_TABLE`` with a
         space;
      4. collapse runs of spaces and trim both ends.

    Args:
        s: the raw tweet text (``str``).

    Returns:
        The cleaned text.  Words are separated by exactly one ``" "``; the
        result is ``""`` when nothing survives cleaning.
    """
    s = s.lower()

    # re.sub removes every match in one pass.  The previous findall + replace
    # loop corrupted text whenever one match was a substring of another
    # (e.g. "1 12" left a stray "2").
    for pattern in _CLEANWORD_NOISE_PATTERNS:
        s = pattern.sub(" ", s)

    s = s.translate(_CLEANWORD_PUNCT_TABLE)

    # Only the space character separates words here (tabs are left inside
    # words, as before).  Joining with " " also fixes the old bug where the
    # final word was glued onto the previous one.
    return " ".join(word for word in s.split(" ") if word)
# Clean the tweet text of both frames in place.
for frame in (data_test, data_train):
    frame['text'] = frame['text'].apply(cleanword)

# Word2Vec corpus: one token list per tweet, train rows first, then test rows.
sentences = [
    line.split(" ")
    for frame in (data_train, data_test)
    for line in frame['text'].values
]
print(len(sentences))
path_model_w2v = os.path.join(path_home, "w2v_model.model")

# Word2Vec hyper-parameters (gensim 3.x keyword names).
w2v_params = dict(
    size=200,           # embedding dimension
    alpha=0.025,        # default
    window=5,           # default
    min_count=2,        # 2 or 3
    sample=0.001,
    seed=2018,
    workers=11,         # worker threads
    min_alpha=0.0001,
    sg=0,               # CBOW
    hs=0,               # negative sampling
    negative=5,         # number of negative samples
    ns_exponent=0.75,
    cbow_mean=1,        # sum, then take the average
    iter=10,            # 10 to 15
)

# Train on the cleaned tweets and persist the model next to the data.
model_w2v = Word2Vec(sentences=sentences, **w2v_params)
model_w2v.save(path_model_w2v)
10876
# Words per cleaned tweet; the 95th percentile is used to pick the sequence length.
data_train_copy = data_train.assign(
    words_length=data_train['text'].apply(lambda text: len(text.split(" ")))
)
np.percentile(data_train_copy['words_length'].tolist(), 95)
23.0
def getvec(data, max_len=23, dim=200, model=None):
    """Turn a column of cleaned tweets into a padded tensor of word vectors.

    Each text is split on single spaces and truncated to its first
    ``max_len`` tokens.  Tokens missing from the model vocabulary are
    skipped (later tokens move up), and the remaining positions are left
    as zero vectors.

    Args:
        data: object exposing ``.values`` as a sequence of strings
            (the callers in this file pass a pandas Series).
        max_len: number of time steps per text.  Defaults to 23, the value
            previously hard-coded here.
        dim: width of one word vector.  Must match the width of the vectors
            returned by ``model``.  Defaults to 200.
        model: object whose ``.wv[word]`` returns a vector and raises
            ``KeyError`` for unknown words.  Defaults to the module-level
            ``model_w2v``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data.values), max_len, dim)`` and
        dtype float64, also when ``data`` is empty.
    """
    if model is None:
        model = model_w2v
    # Word2Vec.__getitem__ is deprecated (removed in gensim 4.0); the vectors
    # are reached through the `.wv` attribute instead.
    vectors = model.wv

    texts = data.values  # fetched once instead of on every iteration
    out = np.zeros((len(texts), max_len, dim))
    for row, text in enumerate(texts):
        col = 0
        for word in text.split(" ")[:max_len]:
            try:
                out[row, col] = vectors[word]
            except KeyError:
                # Out-of-vocabulary token: skip it without using up a slot.
                continue
            col += 1
    return out
# Vectorise the cleaned text of the train and test frames.
x_vec_train, x_vec_test = (
    getvec(frame["text"]) for frame in (data_train, data_test)
)
D:\anaconda\envs\python37-tf2\lib\site-packages\ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
# Report the shapes of both vectorised sets.
for vec_array in (x_vec_train, x_vec_test):
    print(vec_array.shape)
(7613, 23, 200)
(3263, 23, 200)
# Hold out a validation set (train_test_split defaults), then print each shape.
labels = data_train["target"].values
x_train, x_valid, y_train, y_valid = train_test_split(x_vec_train, labels)
for split in (x_train, y_train, x_valid, y_valid):
    print(split.shape)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(5709, 23, 200)
(5709,)
(1904, 23, 200)
(1904,)
#build model

# print(dir(keras.layers))
# print(dir(keras.Model))
class GRUModel(tf.keras.Model):
    """Binary classifier: two stacked GRU layers and one sigmoid output unit.

    The first GRU returns its full output sequence so the second GRU can
    consume it; the second GRU returns only its final output, which the dense
    layer maps to a single probability.

    Args:
        batch_size: stored on the instance; not used by the layers.
        seq_length: stored on the instance; not used by the layers.
        cell_size: number of units in each GRU layer.
    """

    def __init__(self, batch_size, seq_length, cell_size):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.cell_size = cell_size

        # No `input_shape` is passed here: the previous `input_shape()`
        # argument was a syntax error, and a subclassed model gets its input
        # shape from build() or the first call.
        self.layer_GRU = tf.keras.layers.GRU(self.cell_size, return_sequences=True)
        self.layer_last_GRU = tf.keras.layers.GRU(self.cell_size)
        # Sigmoid output: the model is compiled with the string loss
        # "binary_crossentropy", which expects probabilities, not raw logits.
        self.layer_dense = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, inputs):
        """Run the forward pass on `inputs` and return the dense layer output.

        The file builds this model with input shape (None, 23, 200).
        """
        # The inputs go straight into the first GRU; the old `self.layer1`
        # reshape layer was never created in __init__.
        x = self.layer_GRU(inputs)
        x = self.layer_last_GRU(x)
        return self.layer_dense(x)

# Instantiate the GRU classifier, build it for (batch, 23, 200) inputs, and print a summary.
gru_config = {"batch_size": 10, "seq_length": 23, "cell_size": 200}
model_gru = GRUModel(**gru_config)
model_gru.build(input_shape=(None, gru_config["seq_length"], 200))
model_gru.summary()

Model: "gru_model_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
reshape_15 (Reshape)         multiple                  0         
_________________________________________________________________
gru_28 (GRU)                 multiple                  121800    
_________________________________________________________________
gru_29 (GRU)                 multiple                  241200    
_________________________________________________________________
dense_14 (Dense)             multiple                  201       
=================================================================
Total params: 363,201
Trainable params: 363,201
Non-trainable params: 0
_________________________________________________________________
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric.
model_gru.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train for two epochs in mini-batches of 10, validating on the held-out split.
history = model_gru.fit(
    x=x_train,
    y=y_train,
    batch_size=10,
    epochs=2,
    validation_data=(x_valid, y_valid),
)

WARNING:tensorflow:Entity <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>>, which Python reported as:
    def call(self, inputs):
        x = self.layer1(inputs)
#         x = self.layer_input(inputs)
        x = self.layer_GRU(x)
        x = self.layer_last_GRU(x)
        output = self.layer_dense(x)
        return output

This may be caused by multiline strings or comments not indented at the same level as the code.
WARNING: Entity <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>>, which Python reported as:
    def call(self, inputs):
        x = self.layer1(inputs)
#         x = self.layer_input(inputs)
        x = self.layer_GRU(x)
        x = self.layer_last_GRU(x)
        output = self.layer_dense(x)
        return output

This may be caused by multiline strings or comments not indented at the same level as the code.
Train on 5709 samples, validate on 1904 samples
Epoch 1/2
  40/5709 [..............................] - ETA: 2:47:25 - loss: 8.2268 - accuracy: 0.4667


---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)

<ipython-input-81-117a90633db7> in <module>
      4     validation_data=(x_valid,y_valid),
      5     epochs = 2,
----> 6     batch_size= 10,
      7 )


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    726         max_queue_size=max_queue_size,
    727         workers=workers,
--> 728         use_multiprocessing=use_multiprocessing)
    729 
    730   def evaluate(self,


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    322                 mode=ModeKeys.TRAIN,
    323                 training_context=training_context,
--> 324                 total_epochs=epochs)
    325             cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
    326 


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
    121         step=step, mode=mode, size=current_batch_size) as batch_logs:
    122       try:
--> 123         batch_outs = execution_function(iterator)
    124       except (StopIteration, errors.OutOfRangeError):
    125         # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in execution_function(input_fn)
     84     # `numpy` translates Tensors to values in Eager mode.
     85     return nest.map_structure(_non_none_constant_value,
---> 86                               distributed_function(input_fn))
     87 
     88   return execution_function


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
    455 
    456     tracing_count = self._get_tracing_count()
--> 457     result = self._call(*args, **kwds)
    458     if tracing_count == self._get_tracing_count():
    459       self._call_counter.called_without_tracing()


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
    485       # In this case we have created variables on the first call, so we run the
    486       # defunned version which is guaranteed to never create variables.
--> 487       return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
    488     elif self._stateful_fn is not None:
    489       # Release the lock early so that multiple threads can perform the call


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in __call__(self, *args, **kwargs)
   1821     """Calls a graph function specialized to the inputs."""
   1822     graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823     return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
   1824 
   1825   @property


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in _filtered_call(self, args, kwargs)
   1139          if isinstance(t, (ops.Tensor,
   1140                            resource_variable_ops.BaseResourceVariable))),
-> 1141         self.captured_inputs)
   1142 
   1143   def _call_flat(self, args, captured_inputs, cancellation_manager=None):


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
   1222     if executing_eagerly:
   1223       flat_outputs = forward_function.call(
-> 1224           ctx, args, cancellation_manager=cancellation_manager)
   1225     else:
   1226       gradient_name = self._delayed_rewrite_functions.register()


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in call(self, ctx, args, cancellation_manager)
    509               inputs=args,
    510               attrs=("executor_type", executor_type, "config_proto", config),
--> 511               ctx=ctx)
    512         else:
    513           outputs = execute.execute_with_cancellation(


D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     59     tensors = pywrap_tensorflow.TFE_Py_Execute(ctx._handle, device_name,
     60                                                op_name, inputs, attrs,
---> 61                                                num_outputs)
     62   except core._NotOkStatusException as e:
     63     if name is not None:


KeyboardInterrupt: 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章