import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from gensim.models.word2vec import Word2Vec
import tqdm
import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from sklearn.model_selection import train_test_split
2.0.0
Log
1:lstm
2:gru
# Load the competition CSVs (train, test, sample submission) from the project folder.
path_home = r"D:\pro\tianchi\kaggle_tweet"
data_train, data_test, data_submit = (
    pd.read_csv(os.path.join(path_home, csv_name), encoding="utf-8")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
import re
# Patterns are compiled once; raw strings avoid invalid-escape warnings.
_URL_RE = re.compile(r"http\S*")
_MENTION_RE = re.compile(r"@\S*")
_DIGITS_RE = re.compile(r"\d+")
# Mis-decoded byte sequences start with U+0089; drop that char plus up to
# four following non-space chars (the original removed a 5-char prefix).
_MOJIBAKE_RE = re.compile(r"\x89\S{0,4}")
# Every punctuation character (and newline) that is turned into a space.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in "\n,?.[]!:-#|();=></"})


def cleanword(s):
    """Normalise a tweet into lower-case, space-separated tokens.

    Steps, in order: lower-case; remove URLs (``http...``), @mentions,
    digit runs and U+0089 mojibake fragments; turn a fixed set of
    punctuation characters into spaces; collapse whitespace runs into a
    single space and strip both ends.

    Args:
        s: Raw text of one tweet.

    Returns:
        The cleaned string; words are separated by exactly one space.
        Returns ``""`` when nothing is left.
    """
    s = s.lower()
    # re.sub removes every match in one pass. The previous findall +
    # str.replace approach left residue when one match was a prefix of a
    # later one (e.g. "12 123" kept the trailing "3").
    for pattern in (_URL_RE, _MENTION_RE, _DIGITS_RE, _MOJIBAKE_RE):
        s = pattern.sub(" ", s)
    s = s.translate(_PUNCT_TO_SPACE)
    # split()/join collapses whitespace and keeps a separator before the
    # last word (the hand-written loop glued the final word to the
    # previous one: "hello world" -> "helloworld").
    return " ".join(s.split())
# Clean both text columns in place, then tokenise every tweet (train first,
# then test) on single spaces to form the Word2Vec training corpus.
data_test['text'] = data_test['text'].apply(cleanword)
data_train['text'] = data_train['text'].apply(cleanword)
sentences = [
    tweet.split(" ")
    for frame in (data_train, data_test)
    for tweet in frame['text'].values
]
print(len(sentences))
# Train a Word2Vec model on the tokenised tweets and save it next to the data.
path_model_w2v = os.path.join(path_home, "w2v_model.model")
model_w2v = Word2Vec(
    sentences=sentences,
    # --- architecture ---
    sg=0,              # CBOW
    cbow_mean=1,       # average the summed context vectors
    size=200,          # embedding dimension
    window=5,          # default
    # --- vocabulary / sampling ---
    min_count=2,       # 2 or 3
    sample=0.001,
    hs=0,              # negative sampling
    negative=5,        # number of negative samples
    ns_exponent=0.75,
    # --- optimisation ---
    alpha=0.025,       # default
    min_alpha=0.0001,
    iter=10,           # 10 to 15
    # --- runtime ---
    seed=2018,
    workers=11,        # threads
)
model_w2v.save(path_model_w2v)
10876
# Count space-separated tokens per training tweet and inspect the 95th
# percentile of that count on a copy of the training frame.
data_train_copy = data_train.copy()
data_train_copy["words_length"] = [
    len(tweet.split(" ")) for tweet in data_train_copy['text']
]
# data_train_copy
np.percentile(list(data_train_copy['words_length']), 95)
23.0
def getvec(data, max_len=23, model=None):
    """Turn each text into a fixed-size matrix of word vectors.

    Each text is split on single spaces and truncated to its first
    ``max_len`` tokens. Tokens missing from the embedding vocabulary are
    skipped, the remaining vectors are packed from position 0, and the
    rest of the matrix is zero padding. Truncation happens before
    out-of-vocabulary filtering, as in the original implementation.

    Args:
        data: Object exposing ``.values`` as a sequence of strings
            (the callers pass a pandas Series of cleaned tweets).
        max_len: Number of token slots per text. Defaults to 23.
        model: Trained embedding model exposing ``.wv`` (word -> vector,
            raising ``KeyError`` for unknown words) and ``.vector_size``.
            Defaults to the module-level ``model_w2v``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), max_len, vector_size)``.
    """
    if model is None:
        model = model_w2v
    # Look vectors up through `.wv`; indexing the model object directly is
    # deprecated and removed in gensim 4.
    vectors = model.wv
    dim = model.vector_size
    texts = data.values  # hoisted: materialise the array once, not per row
    X = np.zeros((len(texts), max_len, dim))
    for row, text in enumerate(texts):
        slot = 0
        for word in text.split(" ")[:max_len]:
            try:
                X[row, slot] = vectors[word]
            except KeyError:
                # Out-of-vocabulary token: skip it without using a slot.
                continue
            slot += 1
    return X
# Vectorise the cleaned train and test tweets (train is processed first).
x_vec_train, x_vec_test = (
    getvec(frame["text"]) for frame in (data_train, data_test)
)
D:\anaconda\envs\python37-tf2\lib\site-packages\ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
# Show the shapes of both feature tensors, one per line.
print(x_vec_train.shape, x_vec_test.shape, sep="\n")
(7613, 23, 200)
(3263, 23, 200)
# Hold out part of the training data for validation. The split is seeded
# (same seed as the Word2Vec model) so repeated runs evaluate on the same
# validation rows; previously every run produced a different split.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_vec_train,
    data_train["target"].values,
    random_state=2018,
)
print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(5709, 23, 200)
(5709,)
(1904, 23, 200)
(1904,)
#build model
# print(dir(keras.layers))
# print(dir(keras.Model))
class GRUModel(tf.keras.Model):
    """Two stacked GRU layers followed by a single sigmoid output unit.

    The model consumes sequences of word vectors directly. It is built
    elsewhere in this file with input shape ``(None, 23, 200)``, i.e.
    (batch, tokens, embedding dim). It outputs one probability per
    sample, which matches the ``binary_crossentropy`` loss it is
    compiled with.

    Args:
        batch_size: Stored on the instance; not used by the layers.
        seq_length: Stored on the instance; not used by the layers.
        cell_size: Number of units in each GRU layer.
    """

    def __init__(self, batch_size, seq_length, cell_size):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.cell_size = cell_size
        # First GRU returns the full sequence so the second GRU can consume it.
        # The previous `input_shape()` argument was a syntax error (positional
        # after keyword) and is unnecessary: the shape is inferred on build.
        self.layer_GRU = tf.keras.layers.GRU(self.cell_size, return_sequences=True)
        # Second GRU returns only its final state: (batch, cell_size).
        self.layer_last_GRU = tf.keras.layers.GRU(self.cell_size)
        # Sigmoid so the output is a probability; a bare Dense(1) emits logits,
        # which the default binary cross-entropy does not expect.
        self.layer_dense = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, inputs):
        """Run the forward pass and return one probability per sample."""
        # Feed the embedded sequence straight into the GRU stack. The former
        # `self.layer1` reshape was never defined in __init__.
        x = self.layer_GRU(inputs)
        x = self.layer_last_GRU(x)
        return self.layer_dense(x)
# Create the GRU classifier (batch_size, seq_length, cell_size), build its
# weights for (batch, 23 tokens, 200-dim vectors) input, and print the layers.
model_gru = GRUModel(10, 23, 200)
model_gru.build((None, 23, 200))
model_gru.summary()
Model: "gru_model_15"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
reshape_15 (Reshape) multiple 0
_________________________________________________________________
gru_28 (GRU) multiple 121800
_________________________________________________________________
gru_29 (GRU) multiple 241200
_________________________________________________________________
dense_14 (Dense) multiple 201
=================================================================
Total params: 363,201
Trainable params: 363,201
Non-trainable params: 0
_________________________________________________________________
# Configure training (Adam, binary cross-entropy, accuracy metric) and fit
# for two epochs, evaluating on the held-out split after each epoch.
model_gru.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model_gru.fit(
    x=x_train,
    y=y_train,
    batch_size=10,
    epochs=2,
    validation_data=(x_valid, y_valid),
)
WARNING:tensorflow:Entity <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>>, which Python reported as:
def call(self, inputs):
x = self.layer1(inputs)
# x = self.layer_input(inputs)
x = self.layer_GRU(x)
x = self.layer_last_GRU(x)
output = self.layer_dense(x)
return output
This may be caused by multiline strings or comments not indented at the same level as the code.
WARNING: Entity <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method GRUModel.call of <__main__.GRUModel object at 0x000001DBF8395668>>, which Python reported as:
def call(self, inputs):
x = self.layer1(inputs)
# x = self.layer_input(inputs)
x = self.layer_GRU(x)
x = self.layer_last_GRU(x)
output = self.layer_dense(x)
return output
This may be caused by multiline strings or comments not indented at the same level as the code.
Train on 5709 samples, validate on 1904 samples
Epoch 1/2
40/5709 [..............................] - ETA: 2:47:25 - loss: 8.2268 - accuracy: 0.4667
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-81-117a90633db7> in <module>
4 validation_data=(x_valid,y_valid),
5 epochs = 2,
----> 6 batch_size= 10,
7 )
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
726 max_queue_size=max_queue_size,
727 workers=workers,
--> 728 use_multiprocessing=use_multiprocessing)
729
730 def evaluate(self,
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
322 mode=ModeKeys.TRAIN,
323 training_context=training_context,
--> 324 total_epochs=epochs)
325 cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
326
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
121 step=step, mode=mode, size=current_batch_size) as batch_logs:
122 try:
--> 123 batch_outs = execution_function(iterator)
124 except (StopIteration, errors.OutOfRangeError):
125 # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in execution_function(input_fn)
84 # `numpy` translates Tensors to values in Eager mode.
85 return nest.map_structure(_non_none_constant_value,
---> 86 distributed_function(input_fn))
87
88 return execution_function
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\def_function.py in __call__(self, *args, **kwds)
455
456 tracing_count = self._get_tracing_count()
--> 457 result = self._call(*args, **kwds)
458 if tracing_count == self._get_tracing_count():
459 self._call_counter.called_without_tracing()
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\def_function.py in _call(self, *args, **kwds)
485 # In this case we have created variables on the first call, so we run the
486 # defunned version which is guaranteed to never create variables.
--> 487 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
488 elif self._stateful_fn is not None:
489 # Release the lock early so that multiple threads can perform the call
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in __call__(self, *args, **kwargs)
1821 """Calls a graph function specialized to the inputs."""
1822 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1823 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
1824
1825 @property
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in _filtered_call(self, args, kwargs)
1139 if isinstance(t, (ops.Tensor,
1140 resource_variable_ops.BaseResourceVariable))),
-> 1141 self.captured_inputs)
1142
1143 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1222 if executing_eagerly:
1223 flat_outputs = forward_function.call(
-> 1224 ctx, args, cancellation_manager=cancellation_manager)
1225 else:
1226 gradient_name = self._delayed_rewrite_functions.register()
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\function.py in call(self, ctx, args, cancellation_manager)
509 inputs=args,
510 attrs=("executor_type", executor_type, "config_proto", config),
--> 511 ctx=ctx)
512 else:
513 outputs = execute.execute_with_cancellation(
D:\anaconda\envs\python37-tf2\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
59 tensors = pywrap_tensorflow.TFE_Py_Execute(ctx._handle, device_name,
60 op_name, inputs, attrs,
---> 61 num_outputs)
62 except core._NotOkStatusException as e:
63 if name is not None:
KeyboardInterrupt: