聚類案例之股票信息分類

附代碼:

 

import pandas as pd
beer = pd.read_csv('E:\\1\\000518.csv',sep=',')

In [3]:

 

beer

Out[3]:

  open close high low volume money
0 2.26 2.25 2.26 2.25 64047.0 144526.0
1 2.25 2.25 2.26 2.25 7020.0 15805.0
2 2.25 2.24 2.25 2.24 52599.0 118287.0
3 2.24 2.24 2.25 2.24 39098.0 87629.0
4 2.24 2.24 2.24 2.23 45794.0 102386.0
5 2.24 2.24 2.24 2.24 52923.0 118635.0
6 2.24 2.25 2.25 2.24 33374.0 74939.0
7 2.25 2.25 2.25 2.24 22465.0 50455.0
8 2.24 2.25 2.25 2.24 72471.0 162653.0
9 2.25 2.25 2.26 2.24 20845.0 46884.0
10 2.26 2.26 2.26 2.25 3888.0 8774.0
11 2.26 2.25 2.26 2.25 34886.0 78752.0
12 2.26 2.26 2.26 2.26 3348.0 7564.0
13 2.26 2.26 2.26 2.26 5184.0 11712.0
14 2.26 2.26 2.26 2.26 7236.0 16348.0
15 2.26 2.26 2.26 2.26 5400.0 12200.0
16 2.26 2.26 2.26 2.26 60267.0 136152.0
17 2.26 2.26 2.26 2.26 4320.0 9760.0
18 2.26 2.26 2.26 2.26 6588.0 14884.0
19 2.26 2.26 2.26 2.26 33482.0 75640.0
20 2.26 2.26 2.26 2.26 12745.0 28792.0
21 2.26 2.25 2.26 2.25 35642.0 80492.0
22 2.26 2.26 2.26 2.26 90184.0 203740.0
23 2.25 2.26 2.26 2.25 24841.0 55893.0
24 2.26 2.25 2.26 2.25 1620.0 3652.0
25 2.25 2.26 2.26 2.25 6804.0 15314.0
26 2.25 2.26 2.26 2.25 11881.0 26740.0
27 2.25 2.26 2.26 2.25 24409.0 55024.0
28 2.25 2.26 2.26 2.25 3780.0 8525.0
29 2.25 2.26 2.26 2.25 41366.0 93233.0
... ... ... ... ... ... ...
869850 3.28 3.28 3.28 3.28 300.0 984.0
869851 3.28 3.28 3.28 3.28 0.0 0.0
869852 3.29 3.29 3.29 3.29 100.0 329.0
869853 3.29 3.28 3.29 3.28 17200.0 56546.0
869854 3.29 3.28 3.29 3.28 1400.0 4597.0
869855 3.29 3.29 3.29 3.29 200.0 658.0
869856 3.28 3.29 3.29 3.28 4100.0 13459.0
869857 3.29 3.28 3.29 3.28 1400.0 4604.0
869858 3.28 3.28 3.28 3.28 0.0 0.0
869859 3.28 3.28 3.28 3.28 15200.0 49856.0
869860 3.28 3.28 3.28 3.28 5600.0 18368.0
869861 3.28 3.29 3.29 3.28 2300.0 7552.0
869862 3.29 3.29 3.29 3.29 0.0 0.0
869863 3.28 3.28 3.28 3.28 2000.0 6560.0
869864 3.29 3.28 3.29 3.28 9200.0 30245.0
869865 3.28 3.28 3.28 3.28 500.0 1640.0
869866 3.28 3.28 3.28 3.28 58800.0 192798.0
869867 3.28 3.28 3.28 3.28 1100.0 3608.0
869868 3.28 3.27 3.28 3.27 38600.0 126372.0
869869 3.27 3.27 3.27 3.27 27700.0 90579.0
869870 3.28 3.28 3.28 3.28 4400.0 14432.0
869871 3.28 3.28 3.28 3.28 1400.0 4592.0
869872 3.28 3.28 3.28 3.27 17100.0 56027.0
869873 3.28 3.28 3.28 3.28 31300.0 102729.0
869874 3.28 3.28 3.28 3.28 2000.0 6560.0
869875 3.28 3.28 3.28 3.28 3300.0 10824.0
869876 3.28 3.28 3.28 3.28 1300.0 4264.0
869877 3.28 3.28 3.28 3.28 0.0 0.0
869878 3.28 3.28 3.28 3.28 0.0 0.0
869879 3.28 3.29 3.29 3.28 3400.0 11156.0

869880 rows × 6 columns

In [6]:

 


 
X=beer[['open','close','high','low','volume','money']]

In [7]:

 


 
#K_means clustering
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3).fit(X)
km2 = KMeans(n_clusters=2).fit(X)

In [8]:

 


 
km.labels_

Out[8]:

array([0, 0, 0, ..., 0, 0, 0])

In [12]:

 


 
beer['close1'] = km.labels_
beer['close2']=km.labels_
beer.sort_values('close1')

Out[12]:

  open close high low volume money close1 close2
0 0 0 2.26 2.25 64047.0 144526.0 0 0
576484 0 0 5.96 5.94 411400.0 2446780.0 0 0
576485 0 0 5.97 5.95 307400.0 1831532.0 0 0
576486 0 0 5.99 5.97 492800.0 2946828.0 0 0
576488 0 0 6.02 6.01 429200.0 2583728.0 0 0
576490 0 0 6.01 5.99 516800.0 3100452.0 0 0
576491 0 0 6.00 5.98 444700.0 2662148.0 0 0
576492 0 0 5.99 5.98 380300.0 2277268.0 0 0
576494 0 0 5.98 5.97 390200.0 2330280.0 0 0
576495 0 0 5.98 5.97 249300.0 1489820.0 0 0
576498 0 0 5.95 5.94 302000.0 1795104.0 0 0
576483 0 0 5.96 5.93 354600.0 2104872.0 0 0
576500 0 0 5.93 5.92 448400.0 2658368.0 0 0
576502 0 0 5.97 5.95 187800.0 1118504.0 0 0
576503 0 0 5.97 5.96 306200.0 1826672.0 0 0
576504 0 0 5.97 5.96 252900.0 1509208.0 0 0
576505 0 0 5.98 5.97 196600.0 1174200.0 0 0
576506 0 0 5.98 5.97 261900.0 1565096.0 0 0
576508 0 0 5.98 5.97 435200.0 2598160.0 0 0
576510 0 0 5.97 5.96 276400.0 1647848.0 0 0
576511 0 0 5.97 5.96 161100.0 960744.0 0 0
576512 0 0 5.97 5.95 299800.0 1786568.0 0 0
576513 0 0 5.97 5.95 139700.0 832264.0 0 0
576501 0 0 5.95 5.93 449800.0 2672864.0 0 0
576514 0 0 5.96 5.95 397000.0 2362656.0 0 0
576481 0 0 5.99 5.97 424400.0 2536606.0 0 0
576477 0 0 6.02 6.02 16700.0 100448.0 0 0
576425 0 0 5.80 5.79 192800.0 1117824.0 0 0
576427 0 0 5.81 5.80 213700.0 1240896.0 0 0
576428 0 0 5.81 5.80 235700.0 1368416.0 0 0
... ... ... ... ... ... ... ... ...
624961 2 2 4.86 4.83 704100.0 3403838.0 2 2
150393 2 2 5.90 5.87 821400.0 4834944.0 2 2
177084 2 2 6.91 6.89 575700.0 3972896.0 2 2
177087 2 2 6.90 6.88 492700.0 3394368.0 2 2
177089 2 2 6.90 6.89 492200.0 3394752.0 2 2
285619 2 2 4.43 4.38 2127800.0 9331944.0 2 2
177104 2 2 6.88 6.86 485600.0 3339008.0 2 2
177109 2 2 6.88 6.86 606800.0 4172352.0 2 2
133974 2 2 6.46 6.43 512000.0 3302224.0 2 2
823290 2 2 4.33 4.22 1876800.0 8073984.0 2 2
177112 2 2 6.87 6.86 475700.0 3267200.0 2 2
823292 2 2 4.30 4.18 918000.0 3885136.0 2 2
624982 2 2 4.82 4.79 846400.0 4066904.0 2 2
177113 2 2 6.87 6.86 1610000.0 11050560.0 2 2
335774 2 2 7.87 7.83 1177600.0 9240112.0 2 2
177114 2 2 6.87 6.86 528200.0 3626880.0 2 2
624978 2 2 4.83 4.79 1828100.0 8784036.0 2 2
177115 2 2 6.87 6.86 571900.0 3926976.0 2 2
177116 2 2 6.87 6.86 541800.0 3720832.0 2 2
384885 2 2 7.08 7.03 576100.0 4067248.0 2 2
177119 2 2 6.87 6.87 1158400.0 7958208.0 2 2
177120 2 2 6.92 6.89 1121600.0 7734116.0 2 2
177121 2 2 6.91 6.86 618700.0 4261458.0 2 2
285642 2 2 4.39 4.38 839800.0 3681336.0 2 2
624966 2 2 4.85 4.79 1627700.0 7836872.0 2 2
150396 2 2 5.89 5.85 599000.0 3517232.0 2 2
150395 2 2 5.89 5.87 570800.0 3357552.0 2 2
150394 2 2 5.90 5.87 1019900.0 5993008.0 2 2
624979 2 2 4.80 4.77 854100.0 4084476.0 2 2
341983 2 2 7.05 7.04 707300.0 4988704.0 2 2

869880 rows × 8 columns

In [16]:

 


 
from pandas.plotting import scatter_matrix
%matplotlib inline
close_centers = km.cluster_centers_
close_centers_2 = km2.cluster_centers_

In [22]:

 


 
beer.groupby("close").mean()

Out[22]:

  open high low volume money close1 close2
close              
0 0 4.678545 4.669329 7.491448e+04 3.665092e+05 0 0
1 1 7.283650 7.205276 6.855841e+06 4.644045e+07 1 1
2 2 6.804501 6.773411 9.541442e+05 6.112502e+06 2 2

In [20]:

 


 
beer.groupby("close2").mean()

Out[20]:

  open close high low volume money close1
close2              
0 0 0 4.678545 4.669329 7.491448e+04 3.665092e+05 0
1 1 1 7.283650 7.205276 6.855841e+06 4.644045e+07 1
2 2 2 6.804501 6.773411 9.541442e+05 6.112502e+06 2

In [33]:

 


 
centers = beer.groupby("close").mean().reset_index()

In [25]:

 


 
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14

In [26]:

 


 
import numpy as np
colors = np.array(['red','green','blue','yellow'])

In [34]:

 


 
plt.scatter(beer['open'],beer['high'],c = colors[beer['close']])
plt.scatter(centers.open,centers.high,linewidths=3,marker='+',s=300,c='black')
plt.xlabel("open")
plt.ylabel("high")

Out[34]:

Text(0, 0.5, 'high')

In [35]:

 


 
scatter_matrix(beer[["close","open","high","low"]],s=100,alpha=1,c=colors[beer["close"]],figsize=(10,10))
plt.suptitle("With 3 centroids initialized")

Out[35]:

Text(0.5, 0.98, 'With 3 centroids initialized')

In [39]:

 


 
#scaled data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

Out[39]:

array([[-1.24466895, -1.24971352, -1.24622056, -1.24811754, -0.14123794,
        -0.24382383],
       [-1.24970706, -1.24971352, -1.24622056, -1.24811754, -0.36183077,
        -0.32424962],
       [-1.24970706, -1.2547516 , -1.25125501, -1.25315928, -0.18552129,
        -0.26021814],
       ...,
       [-0.73078238, -0.73079179, -0.73270656, -0.72881834, -0.38898565,
        -0.33412469],
       [-0.73078238, -0.73079179, -0.73270656, -0.72881834, -0.38898565,
        -0.33412469],
       [-0.73078238, -0.72575372, -0.72767211, -0.72881834, -0.37583371,
        -0.32715434]])

In [40]:

 


 
km = KMeans(n_clusters=3).fit(X_scaled)

In [41]:

 


 
beer["scaled_cluster"] = km.labels_
beer.sort_values("scaled_cluster")

Out[41]:

  open close high low volume money close1 close2 scaled_cluster
667794 0 0 6.79 6.79 9300.0 63147.0 0 0 0
644428 0 0 6.96 6.87 71100.0 492974.0 0 0 0
644429 0 0 6.94 6.89 144600.0 998647.0 0 0 0
644430 0 0 6.89 6.86 152000.0 1045036.0 0 0 0
644431 0 0 6.87 6.85 115000.0 788371.0 0 0 0
644432 0 0 6.86 6.84 188200.0 1287814.0 0 0 0
644433 0 0 6.87 6.84 136400.0 934928.0 0 0 0
644434 0 0 6.88 6.86 294500.0 2025171.0 0 0 0
644435 0 0 6.90 6.89 82400.0 568002.0 0 0 0
644436 0 0 6.90 6.88 133900.0 923330.0 0 0 0
644437 0 0 6.90 6.88 90400.0 622793.0 0 0 0
644438 0 0 6.89 6.87 150600.0 1036431.0 0 0 0
644439 0 0 6.90 6.89 65700.0 452861.0 0 0 0
644440 0 0 6.90 6.88 48600.0 335217.0 0 0 0
644441 0 0 6.93 6.89 99100.0 684674.0 0 0 0
644442 0 0 6.95 6.92 74300.0 515826.0 0 0 0
644443 0 0 6.95 6.93 70300.0 487831.0 0 0 0
644444 0 0 6.94 6.92 162100.0 1123415.0 0 0 0
644445 0 0 6.94 6.91 217100.0 1503263.0 0 0 0
644446 0 0 6.96 6.92 250700.0 1740438.0 0 0 0
644447 2 2 7.05 6.94 539600.0 3774772.0 2 2 0
644448 0 0 7.10 7.00 427200.0 3024468.0 0 0 0
644427 0 0 6.88 6.84 343700.0 2356016.0 0 0 0
644449 0 0 7.07 7.01 162300.0 1140414.0 0 0 0
644426 0 0 6.86 6.84 236600.0 1621721.0 0 0 0
644424 0 0 6.90 6.85 412200.0 2833977.0 0 0 0
644403 0 0 6.96 6.93 341900.0 2375505.0 0 0 0
644404 0 0 6.98 6.94 301900.0 2099317.0 0 0 0
644405 0 0 7.01 6.97 429800.0 3006154.0 0 0 0
644406 0 0 7.00 6.96 166800.0 1164742.0 0 0 0
... ... ... ... ... ... ... ... ... ...
618442 2 2 7.06 7.01 1116900.0 7846912.0 2 2 2
618441 2 2 7.02 7.00 1908000.0 13381184.0 2 2 2
618440 2 2 7.04 7.01 1622400.0 11398784.0 2 2 2
618439 2 2 7.07 7.03 1334500.0 9399104.0 2 2 2
618438 2 2 7.10 7.05 1093700.0 7742592.0 2 2 2
618437 2 2 7.13 7.10 1618000.0 11503744.0 2 2 2
631278 2 2 6.29 6.26 991200.0 6210959.0 2 2 2
569910 2 2 6.85 6.84 948300.0 6491008.0 2 2 2
569907 2 2 6.80 6.76 1801200.0 12228352.0 2 2 2
700290 2 2 9.08 9.05 947400.0 8585212.0 2 2 2
631284 2 2 6.24 6.20 1253900.0 7797035.0 2 2 2
247465 2 2 4.11 4.10 1247300.0 5114904.0 2 2 2
627605 2 2 5.92 5.92 1081100.0 6400640.0 2 2 2
591024 2 2 5.89 5.87 1119900.0 6586016.0 2 2 2
591023 2 2 5.89 5.88 1129000.0 6646656.0 2 2 2
591022 2 2 5.90 5.88 2324200.0 13693904.0 2 2 2
591021 2 2 5.88 5.85 2427700.0 14238800.0 2 2 2
591020 2 2 5.85 5.79 2601700.0 15139256.0 2 2 2
627604 2 2 5.92 5.92 2050400.0 12138376.0 2 2 2
627603 2 2 5.92 5.92 2218300.0 13132376.0 2 2 2
627602 1 1 5.92 5.85 8823500.0 52179664.0 1 1 2
627601 2 2 5.86 5.72 2469300.0 14324688.0 2 2 2
627600 1 1 5.80 5.67 8339200.0 47560426.0 1 1 2
658208 2 2 6.92 6.88 1306900.0 9012986.0 2 2 2
247449 2 2 4.10 4.09 1195200.0 4894624.0 2 2 2
247445 2 2 4.18 4.15 2356100.0 9817096.0 2 2 2
247444 2 2 4.18 4.16 5117300.0 21366048.0 2 2 2
618398 2 2 7.12 7.09 1734300.0 12318048.0 2 2 2
247443 2 2 4.16 4.12 2340600.0 9700918.0 2 2 2
361882 2 2 8.44 8.38 2697200.0 22710144.0 2 2 2

869880 rows × 9 columns

 

beer.groupby("scaled_cluster").mean()

In [42]:

 


 
beer.groupby("scaled_cluster").mean()

Out[42]:

  open close high low volume money close1 close2
scaled_cluster                
0 0.078958 0.078958 6.779438 6.767458 1.059397e+05 7.168576e+05 0.078958 0.078958
1 0.006126 0.006126 3.311344 3.303476 7.111992e+04 2.462302e+05 0.006126 0.006126
2 1.957469 1.957469 6.565434 6.524719 1.791183e+06 1.130658e+07 1.957469 1.957469

In [47]:

 


 
pd.plotting.scatter_matrix(X,c=colors[beer.scaled_cluster],alpha=1,figsize=(10,10),s=100)

Out[47]:

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001DBC8A90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AB75390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AB9B940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001ABCBEB8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AC054A8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AC329B0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001AC61F60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AC9C588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AC9C5C0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AD020F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AD356A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AD63C50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001ADA1240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001ADD27F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AE05DA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AE44390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001AE73940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022126EF0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000221634E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022193A90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000221D3080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022203630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022235BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000222731D0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000222A4780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000222D5D30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022311320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000223428D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022374E80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000223B2470>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000223E3A20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022416FD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000224525C0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000022482B70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000224C3160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000224F2710>]],
      dtype=object)

In [ ]:

 


 
from sklearn import metrics
score_scaled = metrics.silhouette_score(X,beer.scaled_cluster)
score = metrics.silhouette_score(X,beer.cluster)
print(score_scaled,score)

In [ ]:

 


 
scores = []
for k in range(2,20):
    labels = KMeans(n_clusters=k).fit(X).labels_
    score = metrics.silhouette_score(X,labels)
    scores.append(score)

 
scores

In [ ]:

 


 
plt.plot(list(range(2,20)),scores)
plt.xlabel("Number of Clusters Initialized")
plt.ylabel("Sihouette Score")

In [ ]:

 


 
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=10,min_s)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章