1.獲取數據
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tushare as ts
from torch.utils.data import DataLoader, Dataset
# Fetch daily index bars via tushare (NOTE: the original comment said 000300,
# but the code actually requests index 000001, from 2018-01-01 to the latest date).
cons=ts.get_apis()
df=ts.bar('000001', conn=cons, asset='INDEX', start_date='2018-01-01', end_date='')
2. 對於獲取的數據按日期進行升序排列,因爲我們要通過歷史的情況預測未來的情況
# Put the oldest rows first — we predict the future from the past.
df = df.sort_index()  # ascending order is the default
print(df.head(5))
3.取開盤價,收盤價,最高價,最低價,交易量五個特徵,並做標準化
# Keep the five features (open/close/high/low/volume) and min-max scale each to [0, 1].
df = df[["open", "close", "high", "low", "vol"]]
# Record the close-price extremes BEFORE scaling: the plotting step at the end
# needs close_max/close_min to map predictions back to real prices, but the
# original script never defined them.
close_max = df["close"].max()
close_min = df["close"].min()
df = df.apply(lambda x: (x - min(x)) / (max(x) - min(x)))
4.構造X和Y
思路:我們根據前n天的數據,預測當天的收盤價(close),例如,根據1月1日,1月2日,1月3日的數據(包含5個特徵) 預測 1月4日的收盤價(一個值)
比如:X=[ ["open1","close1","high1","low1","vol1"] , ["open2","close2","high2","low2","vol2"] , ["open3","close3","high3","low3","vol3"] ] Y=[ close4 ]
這個例子中,X對應的sequence length爲3,input_size=5 (這就是nlp中詞的embedding的概念)
我這邊是設定 sequence 長度爲5 ,就是根據前5天的數據來預測收盤價
# 4. Build supervised pairs: each X sample is `sequence` consecutive days of the
# five features; each Y is the next day's (scaled) close price (column index 1).
sequence = 5  # look-back window: predict today's close from the previous 5 days
X = []
Y = []
for i in range(df.shape[0] - sequence):
    X.append(np.array(df.iloc[i:(i + sequence), ].values, dtype=np.float32))
    Y.append(np.array(df.iloc[(i + sequence), 1], dtype=np.float32))
print(X[0])
print(Y[0])
5.劃分訓練集,測試集,構造數據迭代器等常規操作
class Mydataset(Dataset):
    """Minimal Dataset pairing features `xx` with targets `yy`.

    An optional `transform` callable is applied to the feature side only.
    """

    def __init__(self, xx, yy, transform=None):
        self.x = xx
        self.y = yy
        self.transform = transform  # fixed typo: was `self.tranform`

    def __getitem__(self, index):
        x1 = self.x[index]
        y1 = self.y[index]
        # identity comparison (`is not None`) per PEP 8, instead of `!= None`
        if self.transform is not None:
            return self.transform(x1), y1
        return x1, y1

    def __len__(self):
        return len(self.x)
# # 構建batch
# 5. Chronological 70/30 train/test split and batch iterators.
total_len = len(X)  # was used but never defined in the original script
trainx, trainy = X[:int(0.7 * total_len)], Y[:int(0.7 * total_len)]
testx, testy = X[int(0.7 * total_len):], Y[int(0.7 * total_len):]
# NOTE(review): `transforms` requires `from torchvision import transforms`,
# which is missing from the original imports — verify before running.
# ToTensor adds a channel dim, so train batches come out (batch, 1, seq, feat);
# the training loop squeezes that extra dim away.
train_loader = DataLoader(dataset=Mydataset(trainx, trainy, transform=transforms.ToTensor()),
                          batch_size=12, shuffle=True)
test_loader = DataLoader(dataset=Mydataset(testx, testy), batch_size=12, shuffle=True)
6. 定義LSTM模型
class lstm(nn.Module):
    """Single-layer LSTM regressor: (batch, seq_len, input_size) -> (batch, output_size)."""

    def __init__(self, input_size=5, hidden_size=32, output_size=1):
        super(lstm, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size
        # batch_first=True: inputs arrive as (batch, seq_len, input_size)
        self.rnn = nn.LSTM(input_size=self.input_size,
                           hidden_size=self.hidden_size,
                           batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        # hidden has shape (num_layers * num_directions, batch, hidden_size);
        # the per-step outputs are not needed — only the final hidden state.
        _, (hidden, _cell) = self.rnn(x)
        # Collapse the layer dimension (size 1 here) and project to the output size.
        flat = hidden.reshape(-1, self.hidden_size)
        return self.linear(flat)
7.開始訓練模型
# 7. Train for 100 epochs: MSE loss + Adam over the training batches.
model = lstm()  # the original script never instantiated the model
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(100):
    total_loss = 0
    for idx, (data, label) in enumerate(train_loader):
        # drop the channel dim ToTensor added: (batch, 1, seq, feat) -> (batch, seq, feat)
        data1 = data.squeeze(1)
        # `Variable` is deprecated since PyTorch 0.4 — tensors work directly
        pred = model(data1)
        label = label.unsqueeze(1)  # (batch,) -> (batch, 1) to match pred's shape
        loss = criterion(pred, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
8.開始測試,將預測的收盤價與實際的收盤價畫個圖,紅色表示預測的收盤價,藍色表示實際的收盤價
# 8. Evaluate on the held-out set, collecting scaled predictions and targets.
preds = []
labels = []
for x, label in test_loader:  # index was unused, so enumerate is dropped
    batch = x.squeeze(1)  # (batch, 1, seq, feat) -> (batch, seq, feat)
    out = model(batch)
    preds.extend(out.data.squeeze(1).tolist())
    labels.extend(label.tolist())
下面畫圖,因爲之前做了標準化到0-1區間,所以我圖中要將收盤價恢復到原始的情況,全部取太密集,所以我只取了前50個進行比對
# Plot the first 50 predictions vs. actual closes, de-normalised back to real
# price levels (red = predicted, blue = actual).
import matplotlib.pyplot as plt

# NOTE(review): close_max / close_min must be captured before normalisation —
# they are not defined anywhere in the original script; verify upstream.
plt.plot([ele * (close_max - close_min) + close_min for ele in preds[0:50]], "r", label="pred")
plt.plot([ele * (close_max - close_min) + close_min for ele in labels[0:50]], "b", label="real")
plt.legend()  # the label= arguments have no visible effect without an explicit legend
plt.show()