python pandas groupby


>>> import pandas as pd
>>> import numpy as np

>>> df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
>>> df
      data1     data2 key1 key2
0 -0.123333  0.977207    a  one
1 -1.252341 -1.700965    a  two
2  1.293032 -1.942645    b  one
3  0.101999 -0.346671    b  two
4 -0.926819 -1.112701    a  one

#根據key1對data1進行分組,再計算平均值和計數。groupd爲GroupBy對象,建立時並沒有進行任何計算

>>> groupd =df['data1'].groupby(df['key1'])
>>> groupd.mean()
key1
a   -0.767497
b    0.697515
Name: data1, dtype: float64
>>> groupd.count()
key1
a    3
b    2
Name: data1, dtype: int64

#可以對兩列進行分組,求平均值,統計

>>> means = df['data1'].groupby([df['key1'],df['key2']]).mean()
>>> means
key1  key2
a     one    -0.525076
      two    -1.252341
b     one     1.293032
      two     0.101999
Name: data1, dtype: float64
>>> means = df['data1'].groupby([df['key1'],df['key2']]).count()
>>> means
key1  key2
a     one     2
      two     1
b     one     1
      two     1
Name: data1, dtype: int64


>>> means.unstack()
key2  one  two
key1          
a       2    1
b       1    1

#分組鍵可以是任意合適長度的數組

>>> states = np.array(['ohio','california','california','ohio','ohio'])

>>> years = np.array([2005,2005,2006,2005,2006])
>>> df['data1'].groupby([states,years]).mean()
california  2005   -1.252341
            2006    1.293032
ohio        2005   -0.010667
            2006   -0.926819
Name: data1, dtype: float64


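#注意:pandas 2.0 起 numeric_only 預設爲 False,需改寫爲 df.groupby('key1').mean(numeric_only=True),否則非數值列 key2 會引發 TypeError
>>> df.groupby('key1').mean()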
         data1     data2
key1                    
a    -0.767497 -0.612153
b     0.697515 -1.144658

>>> df.groupby(['key1','key2']).count()
           data1  data2
key1 key2              
a    one       2      2
     two       1      1
b    one       1      1
     two       1      1

>>> df.groupby(['key1','key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

>>> for n,g in df.groupby('key1'):
...    print(n)
...    print(g)
... 
a
      data1     data2 key1 key2
0 -0.123333  0.977207    a  one
1 -1.252341 -1.700965    a  two
4 -0.926819 -1.112701    a  one
b
      data1     data2 key1 key2
2  1.293032 -1.942645    b  one
3  0.101999 -0.346671    b  two
>>> for n,g in df.groupby(['key1','key2']):
...    print(n)
...    print(g)
... 
('a', 'one')
      data1     data2 key1 key2
0 -0.123333  0.977207    a  one
4 -0.926819 -1.112701    a  one
('a', 'two')
      data1     data2 key1 key2
1 -1.252341 -1.700965    a  two
('b', 'one')
      data1     data2 key1 key2
2  1.293032 -1.942645    b  one
('b', 'two')
      data1     data2 key1 key2
3  0.101999 -0.346671    b  two


#前面的 df['data1'].groupby(df['key1']).mean() 與下面的寫法等效

>>> df.groupby('key1')['data1'].mean()
key1
a   -0.767497
b    0.697515
Name: data1, dtype: float64

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章