Python數據分析實戰【第三章】2.16-Pandas合併 merge、join【python】

【課程2.16】 合併 merge、join

Pandas具有全功能的,高性能內存中連接操作,與SQL等關係數據庫非常相似

1.merge合併 → 類似excel的vlookup


df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                    'key2': ['K0', 'K0', 'K0', 'K0'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})
print(pd.merge(df1, df2, left_on='',right_on='',left_index=True ))
print('------')
# left:第一個df
# right:第二個df
# on:參考鍵

print(pd.merge(df3, df4, on=['key1','key2']))
# 多個鏈接鍵
----------------------------------------------------------------------
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K2  C2  D2
3  A3  B3  K3  C3  D3
------
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2

2.參數how → 合併方式


print(pd.merge(df3, df4,on=['key1','key2'], how = 'inner'))  
print('------')
# inner:默認,取交集

print(pd.merge(df3, df4, on=['key1','key2'], how = 'outer'))  
print('------')
# outer:取並集,數據缺失範圍NaN

print(pd.merge(df3, df4, on=['key1','key2'], how = 'left'))  
print('------')
# left:按照df3爲參考合併,數據缺失範圍NaN

print(pd.merge(df3, df4, on=['key1','key2'], how = 'right'))  
# right:按照df4爲參考合併,數據缺失範圍NaN
----------------------------------------------------------------------
    A   B key1 key2   C   D
0  A0  B0   K0   K0  C0  D0
1  A2  B2   K1   K0  C1  D1
2  A2  B2   K1   K0  C2  D2
------
     A    B key1 key2    C    D
0   A0   B0   K0   K0   C0   D0
1   A1   B1   K0   K1  NaN  NaN
2   A2   B2   K1   K0   C1   D1
3   A2   B2   K1   K0   C2   D2
4   A3   B3   K2   K1  NaN  NaN
5  NaN  NaN   K2   K0   C3   D3
------
    A   B key1 key2    C    D
0  A0  B0   K0   K0   C0   D0
1  A1  B1   K0   K1  NaN  NaN
2  A2  B2   K1   K0   C1   D1
3  A2  B2   K1   K0   C2   D2
4  A3  B3   K2   K1  NaN  NaN
------
     A    B key1 key2   C   D
0   A0   B0   K0   K0  C0  D0
1   A2   B2   K1   K0  C1  D1
2   A2   B2   K1   K0  C2  D2
3  NaN  NaN   K2   K0  C3  D3

3.參數 left_on, right_on, left_index, right_index → 當鍵不爲一個列時,可以單獨設置左鍵與右鍵


df1 = pd.DataFrame({'lkey':list('bbacaab'),
                   'data1':range(7)})
df2 = pd.DataFrame({'rkey':list('abd'),
                   'date2':range(3)})
print(pd.merge(df1, df2, left_on='lkey', right_on='rkey'))
print('------')
# df1以‘lkey’爲鍵,df2以‘rkey’爲鍵

df1 = pd.DataFrame({'key':list('abcdfeg'),
                   'data1':range(7)})
df2 = pd.DataFrame({'date2':range(100,105)},
                  index = list('abcde'))
print(pd.merge(df1, df2, left_on='key', right_index=True))
# df1以‘key’爲鍵,df2以index爲鍵
# left_index:爲True時,第一個df以index爲鍵,默認False
# right_index:爲True時,第二個df以index爲鍵,默認False

# 所以left_on, right_on, left_index, right_index可以相互組合:
# left_on + right_on, left_on + right_index, left_index + right_on, left_index + right_index
----------------------------------------------------------------------
  data1 lkey  date2 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a
------
   data1 key  date2
0      0   a    100
1      1   b    101
2      2   c    102
3      3   d    103
5      5   e    104

4.參數 sort


df1 = pd.DataFrame({'key':list('bbacaab'),
                   'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({'key':list('abd'),
                   'date2':[11,2,33]})
x1 = pd.merge(df1,df2, on = 'key', how = 'outer')
x2 = pd.merge(df1,df2, on = 'key', sort=True, how = 'outer')
print(x1)
print(x2)
print('------')
# sort:按照字典順序通過 連接鍵 對結果DataFrame進行排序。默認爲False,設置爲False會大幅提高性能

print(x2.sort_values('data1'))
# 也可直接用Dataframe的排序方法:sort_values,sort_index
----------------------------------------------------------------------
  data1 key  date2
0    1.0   b    2.0
1    3.0   b    2.0
2    7.0   b    2.0
3    2.0   a   11.0
4    5.0   a   11.0
5    9.0   a   11.0
6    4.0   c    NaN
7    NaN   d   33.0
   data1 key  date2
0    2.0   a   11.0
1    5.0   a   11.0
2    9.0   a   11.0
3    1.0   b    2.0
4    3.0   b    2.0
5    7.0   b    2.0
6    4.0   c    NaN
7    NaN   d   33.0
------
   data1 key  date2
3    1.0   b    2.0
0    2.0   a   11.0
4    3.0   b    2.0
6    4.0   c    NaN
1    5.0   a   11.0
5    7.0   b    2.0
2    9.0   a   11.0
7    NaN   d   33.0

5.pd.join() → 直接通過索引鏈接



left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
print(right)
print(left.join(right))
print(left.join(right, how='outer'))  
print('-----')
# 等價於:pd.merge(left, right, left_index=True, right_index=True, how='outer')

df1 = pd.DataFrame({'key':list('bbacaab'),
                   'data1':[1,3,2,4,5,9,7]})
df2 = pd.DataFrame({'key':list('abd'),
                   'date2':[11,2,33]})
print(df1)
print(df2)
print(pd.merge(df1, df2, left_index=True, right_index=True, suffixes=('_1', '_2')))  
print(df1.join(df2['date2']))
print('-----')
# suffixes=('_x', '_y')默認

left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3'],
                     'key': ['K0', 'K1', 'K0', 'K1']})
right = pd.DataFrame({'C': ['C0', 'C1'],
                      'D': ['D0', 'D1']},
                     index=['K0', 'K1'])
print(left)
print(right)
print(left.join(right, on = 'key'))
# 等價於pd.merge(left, right, left_on='key', right_index=True, how='left', sort=False);
# left的‘key’和right的index
----------------------------------------------------------------------
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
-----
   data1 key
0      1   b
1      3   b
2      2   a
3      4   c
4      5   a
5      9   a
6      7   b
   date2 key
0     11   a
1      2   b
2     33   d
   data1 key_1  date2 key_2
0      1     b     11     a
1      3     b      2     b
2      2     a     33     d
   data1 key  date2
0      1   b   11.0
1      3   b    2.0
2      2   a   33.0
3      4   c    NaN
4      5   a    NaN
5      9   a    NaN
6      7   b    NaN
-----
    A   B key
0  A0  B0  K0
1  A1  B1  K1
2  A2  B2  K0
3  A3  B3  K1
     C   D
K0  C0  D0
K1  C1  D1
    A   B key   C   D
0  A0  B0  K0  C0  D0
1  A1  B1  K1  C1  D1
2  A2  B2  K0  C0  D0
3  A3  B3  K1  C1  D1
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章