【課程2.16】 合併 merge、join
Pandas具有全功能的,高性能內存中連接操作,與SQL等關係數據庫非常相似
1.merge合併 → 類似excel的vlookup
# Sample frames for the merge examples.
# df1 / df2 share a single key column named 'key'.
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})
# df3 / df4 share two key columns, 'key1' and 'key2'.
df3 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
df4 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                    'key2': ['K0', 'K0', 'K0', 'K0'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']})

# pd.merge(left, right, on=...)
#   left:  the first DataFrame
#   right: the second DataFrame
#   on:    the column(s) used as the join key
# The previous call passed left_on='' / right_on='' together with
# left_index=True, which pandas rejects; joining on the shared 'key'
# column is what the expected output shows.
print(pd.merge(df1, df2, on='key'))
print('------')

# Several join keys: pass a list of column names to `on`.
print(pd.merge(df3, df4, on=['key1','key2']))
----------------------------------------------------------------------
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
------
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
2.參數how → 合併方式
# The `how` argument selects the join type. df3 and df4 are the two-key
# sample frames built in the previous snippet.

# inner (the default): keep only key pairs present in both frames.
print(df3.merge(df4, how='inner', on=['key1', 'key2']))
print('------')

# outer: keep the union of key pairs; values missing on either side are NaN.
print(df3.merge(df4, how='outer', on=['key1', 'key2']))
print('------')

# left: keep every row of df3; values missing from df4 are NaN.
print(df3.merge(df4, how='left', on=['key1', 'key2']))
print('------')

# right: keep every row of df4; values missing from df3 are NaN.
print(df3.merge(df4, how='right', on=['key1', 'key2']))
----------------------------------------------------------------------
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
------
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
------
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
------
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
3.參數 left_on, right_on, left_index, right_index → 當鍵不爲一個列時,可以單獨設置左鍵與右鍵
# The key columns have different names on each side, so each frame names
# its own key: df1 joins on 'lkey', df2 joins on 'rkey'.
df1 = pd.DataFrame({
    'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = pd.DataFrame({
    'rkey': ['a', 'b', 'd'],
    'date2': range(3),
})
print(df1.merge(df2, left_on='lkey', right_on='rkey'))
print('------')

# A column on one side can also be matched against the index of the other:
# df1 joins on its 'key' column, df2 joins on its index.
df1 = pd.DataFrame({
    'key': ['a', 'b', 'c', 'd', 'f', 'e', 'g'],
    'data1': range(7),
})
df2 = pd.DataFrame(
    {'date2': range(100, 105)},
    index=['a', 'b', 'c', 'd', 'e'],
)
print(df1.merge(df2, left_on='key', right_index=True))
# left_index:  when True, the first frame joins on its index (default False)
# right_index: when True, the second frame joins on its index (default False)
# left_on, right_on, left_index and right_index can therefore be combined as:
#   left_on + right_on, left_on + right_index,
#   left_index + right_on, left_index + right_index
----------------------------------------------------------------------
data1 lkey date2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
------
data1 key date2
0 0 a 100
1 1 b 101
2 2 c 102
3 3 d 103
5 5 e 104
4.參數 sort
# Frames with a repeated, unordered key column to show the `sort` argument.
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': [1, 3, 2, 4, 5, 9, 7],
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'date2': [11, 2, 33],
})

# x1 leaves `sort` at its default; x2 asks for the result to be ordered
# by the join key.
x1 = df1.merge(df2, how='outer', on='key')
x2 = df1.merge(df2, how='outer', on='key', sort=True)
print(x1)
print(x2)
print('------')
# sort: order the resulting DataFrame lexicographically by the join key.
# It defaults to False; per the original note, leaving it False is much
# faster.

# The regular DataFrame sorting methods (sort_values, sort_index) can be
# applied to the merged result as well.
print(x2.sort_values(by='data1'))
----------------------------------------------------------------------
data1 key date2
0 1.0 b 2.0
1 3.0 b 2.0
2 7.0 b 2.0
3 2.0 a 11.0
4 5.0 a 11.0
5 9.0 a 11.0
6 4.0 c NaN
7 NaN d 33.0
data1 key date2
0 2.0 a 11.0
1 5.0 a 11.0
2 9.0 a 11.0
3 1.0 b 2.0
4 3.0 b 2.0
5 7.0 b 2.0
6 4.0 c NaN
7 NaN d 33.0
------
data1 key date2
3 1.0 b 2.0
0 2.0 a 11.0
4 3.0 b 2.0
6 4.0 c NaN
1 5.0 a 11.0
5 7.0 b 2.0
2 9.0 a 11.0
7 NaN d 33.0
5.DataFrame.join() → 直接通過索引連接(pandas沒有頂層的pd.join函數,join是DataFrame的方法)
# Two frames whose index labels only partly overlap.
left = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2'],
          'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    data={'C': ['C0', 'C2', 'C3'],
          'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
print(right)
print(left.join(right))
# The outer join below is equivalent to:
#   pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(left.join(right, how='outer'))
print('-----')

# Both frames carry a 'key' column, so an index-on-index merge needs
# suffixes to tell the two copies apart.
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': [1, 3, 2, 4, 5, 9, 7],
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'date2': [11, 2, 33],
})
print(df1)
print(df2)
# suffixes defaults to ('_x', '_y').
print(df1.merge(df2, left_index=True, right_index=True, suffixes=('_1', '_2')))
print(df1.join(df2['date2']))
print('-----')

# Join a column of the left frame against the index of the right frame.
left = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2', 'A3'],
          'B': ['B0', 'B1', 'B2', 'B3'],
          'key': ['K0', 'K1', 'K0', 'K1']},
)
right = pd.DataFrame(
    data={'C': ['C0', 'C1'],
          'D': ['D0', 'D1']},
    index=['K0', 'K1'],
)
print(left)
print(right)
print(left.join(right, on='key'))
# Equivalent to:
#   pd.merge(left, right, left_on='key', right_index=True, how='left', sort=False)
# i.e. left's 'key' column is matched against right's index.
----------------------------------------------------------------------
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
-----
data1 key
0 1 b
1 3 b
2 2 a
3 4 c
4 5 a
5 9 a
6 7 b
date2 key
0 11 a
1 2 b
2 33 d
data1 key_1 date2 key_2
0 1 b 11 a
1 3 b 2 b
2 2 a 33 d
data1 key date2
0 1 b 11.0
1 3 b 2.0
2 2 a 33.0
3 4 c NaN
4 5 a NaN
5 9 a NaN
6 7 b NaN
-----
A B key
0 A0 B0 K0
1 A1 B1 K1
2 A2 B2 K0
3 A3 B3 K1
C D
K0 C0 D0
K1 C1 D1
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K0 C0 D0
3 A3 B3 K1 C1 D1