# CSV data download link: https://pan.baidu.com/s/1KTS5WzfH4z9Y4U4rIG-3Ig
# Code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans # 引入機器學習
# Plot configuration: select the SimHei font so Chinese labels are displayed,
# and turn off the Unicode minus sign so negative tick values are displayed
# properly as well.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
class GuPiaoData():
    """File-based pipeline that prepares stock indicator data and clusters it.

    The methods are meant to be run one after another: explore, clean, select
    columns, rename columns, standardize, then cluster and plot.  Every step
    reads the file it is given and writes its result to a fixed path under
    ``data/``; that directory is not created here, so it must already exist.

    NOTE(review): the intermediate files use the legacy ``.xls`` extension,
    which depends on Excel engines that recent pandas releases may no longer
    ship -- confirm against the installed pandas version.
    """

    # Source column name -> display name used from transformData() onwards.
    # The insertion order of this mapping is the feature order everywhere.
    _COLUMN_NAMES = {
        'Turnover rate': '換手率',
        'Range of Rise and Fall': '漲跌率',
        'Price-earning ratio': '市盈率',
        'Price-to-book ratio': '市淨率',
        'Marketing rate': '市銷率',
        'Realization rate': '市現率',
    }
    _RAW_COLUMNS = list(_COLUMN_NAMES)
    _FEATURES = list(_COLUMN_NAMES.values())

    def detedData(self, filePath):
        """Explore the raw CSV and save both the data and its summary.

        :param filePath: path of the raw CSV file.
        :return: None. Writes ``data/Gupiaodata01.xls`` (the data itself) and
            ``data/gupiaodata_describe.xls`` (transposed ``describe`` output).
        """
        df = pd.read_csv(filePath)
        # include='all' summarizes non-numeric columns too.
        summary = df.describe(include='all')
        df.to_excel('data/Gupiaodata01.xls')
        summary.T.to_excel('data/gupiaodata_describe.xls')

    def cleanData(self, filePath):
        """Drop every row that has a missing value in any indicator column.

        Only null filtering is performed; no value-range checks are applied.

        :param filePath: path of the Excel file written by ``detedData``.
        :return: None. Writes ``data/gupiaodata_clean.xls``.
        """
        df = pd.read_excel(filePath)
        # A row is kept only when all six indicator columns are present.
        df = df.dropna(subset=self._RAW_COLUMNS)
        df.to_excel('data/gupiaodata_clean.xls')

    def chooseData(self, filepath):
        """Keep only the six indicator columns of the cleaned data.

        :param filepath: path of the cleaned Excel file.
        :return: None. Writes ``data/Gupiaodata_coredata.xls``.
        """
        df = pd.read_excel(filepath)
        df[self._RAW_COLUMNS].to_excel('data/Gupiaodata_coredata.xls')

    def transformData(self, filePath):
        """Rename the indicator columns to their display names.

        :param filePath: path of the Excel file holding the selected columns.
        :return: None. Writes ``data/Gupiaodata_coretransformdata.xls`` with
            exactly the six renamed columns, in the original order.
        """
        df = pd.read_excel(filePath)
        renamed = df[self._RAW_COLUMNS].rename(columns=self._COLUMN_NAMES)
        renamed.to_excel('data/Gupiaodata_coretransformdata.xls')

    def standarData(self, filepath):
        """Standardize each feature column: (value - mean) / standard deviation.

        The population standard deviation (``ddof=0``) is used.  Only the six
        feature columns are standardized and written, so any other column in
        the input (e.g. a saved index column) is ignored.

        :param filepath: path of the Excel file with the renamed columns.
        :return: None. Writes ``data/Gupiao_stdcoredata.xls``.
        """
        df = pd.read_excel(filepath)
        features = df[self._FEATURES]
        standardized = (features - features.mean(axis=0)) / features.std(axis=0, ddof=0)
        standardized.to_excel('data/Gupiao_stdcoredata.xls')

    def classifyData(self, filepath, k=5):
        """Cluster the standardized data with K-Means and plot a radar chart.

        The chart has one spoke per feature and one dashed polygon per cluster
        centre, so it stays consistent for any ``k``.

        :param filepath: path of the standardized Excel file.
        :param k: number of clusters to fit.
        :return: None. Prints the cluster centres and the per-row labels, then
            shows the radar chart.
        """
        df = pd.read_excel(filepath)
        feature_names = self._FEATURES

        kmeans = KMeans(n_clusters=k)
        kmeans.fit(df[feature_names])
        print(kmeans.cluster_centers_)
        print(kmeans.labels_)

        # One row per cluster, one column per feature.
        centers = kmeans.cluster_centers_

        # One spoke per feature, evenly spread around the circle.
        angles = np.linspace(0, 2 * np.pi, len(feature_names), endpoint=False)
        # Repeat the first point at the end so every polygon is closed.
        closed_angles = np.concatenate((angles, angles[:1]))

        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)
        for index, center in enumerate(centers):
            closed_values = np.concatenate((center, center[:1]))
            # Only the line style is fixed; colours come from the default
            # cycle so any number of clusters gets distinct lines.
            ax.plot(closed_angles, closed_values, '--', linewidth=1,
                    label='cluster %d' % index)

        # Tick angles (open, not closed) must match the labels one-to-one.
        ax.set_thetagrids(angles * 180 / np.pi, feature_names)
        ax.set_rlim(-2, 4)
        plt.legend(loc='best')
        plt.show()
if __name__ == '__main__':
    stock_data = GuPiaoData()
    # The preparation steps are one-off and are left disabled; enable them in
    # this order to rebuild the intermediate files from the raw CSV.
    # stock_data.detedData('szgupiaodata.csv')
    # stock_data.cleanData('data/Gupiaodata01.xls')
    # stock_data.chooseData('data/gupiaodata_clean.xls')
    # stock_data.transformData('data/Gupiaodata_coredata.xls')
    # stock_data.standarData('data/Gupiaodata_coretransformdata.xls')

    # Cluster the standardized data into six groups and show the radar chart.
    stock_data.classifyData('data/Gupiao_stdcoredata.xls', k=6)