# -*- coding: utf-8 -*-
"""
Created on Fri Oct 25 16:41:03 2019
@author: weiping
"""
import pandas as pd
import numpy as np
# Two small frames that share the 'name' and 'key1' columns.
df1 = pd.DataFrame({
    'name': list('abcdfg'),
    'key1': [1, 1, 1, 2, 2, 2],
    'date1': [3, 34, 5, 6, 4, 8],
})
df2 = pd.DataFrame({
    'name': list('abcdeh'),
    'key1': [1, 1, 1, 3, 3, 3],
    'date2': [33, 44, 23, 45, 66, 77],
})
df1, df2

# Left/right joins between DataFrames (analogous to joining tables in SQL).

# Left join on a single column; when `how` is omitted the default is 'inner'.
df = df1.merge(df2, on='name', how='left')
df
# Right join, naming the key column separately for each side.
df = df1.merge(df2, left_on='name', right_on='name', how='right')
df
# Left join on a composite key: both 'name' and 'key1' must match.
df = df1.merge(df2, on=['name', 'key1'], how='left')
df
# Joining on an index.

# df1 carries the join key as an ordinary column ...
df1 = pd.DataFrame({"key": list("qwerqee"), "data": range(7)})
# ... while df2 carries it as its row index.
df2 = pd.DataFrame({"da": [3, 6, 7]}, index=['a', 'q', 'e'])
df1, df2
# Right join: match df1's 'key' column against df2's index labels.
df1.merge(df2, left_on='key', right_index=True, how='right')
# Concatenation along an axis (np.concatenate / pd.concat).

# Concatenating NumPy arrays along an axis.
arr = np.arange(12).reshape(3, 4)
arr1 = np.arange(16).reshape(4, 4)
np.concatenate([arr, arr], axis=1)  # join column-wise: (3, 4) + (3, 4)
# Deliberate failure demo: arr has 3 rows and arr1 has 4, so they cannot be
# joined along axis=1. The ValueError is caught so that the rest of the
# script still runs instead of aborting here.
try:
    np.concatenate([arr, arr1], axis=1)
except ValueError as err:
    print("np.concatenate failed as expected: {}".format(err))

# Concatenating DataFrames along an axis.
df1 = pd.DataFrame(arr, columns=['a', 'b', 'c', 'd'])
pd.concat([df1, df1])  # default axis=0: stack the rows
pd.concat([df1, df1], axis=1)  # axis=1: place the frames side by side
# Combining overlapping data ("patching" missing values).

# Built as float from the start: the column has to hold NaN, and this avoids
# depending on an implicit int -> float upcast during the assignments below.
a = pd.DataFrame(list(range(7)), columns=['c1'], dtype=float)
# Label-based slices include both endpoints, so rows 2-3 and 4-5 are blanked.
# (.loc replaces the .ix indexer, which was removed in pandas 1.0.)
a.loc[2:3] = np.nan
a.loc[4:5] = np.nan
b = pd.DataFrame(list(range(4, 11)), columns=['c2'])
# np.where works like an element-wise if/else and returns an ndarray:
# take the value from b where a is null, otherwise keep a's value.
c = pd.DataFrame(np.where(pd.isnull(a), b, a), columns=['c3'])
# combine_first yields the same patched values as np.where above; called on
# a column like this, it returns a Series.
a['c1'].combine_first(b['c2'])
# Reshaping a long table into a wide table.

df = pd.DataFrame({'t1': [11, 11, 11, 22, 22, 22, 33, 33, 33],
                   'item': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
                   'value': [3.4, 3, 4, 5, 6, 7, 8, 8, 9]})
df
# Out[59]:
#    t1 item  value
# 0  11    a    3.4
# 1  11    b    3.0
# 2  11    c    4.0
# 3  22    a    5.0
# 4  22    b    6.0
# 5  22    c    7.0
# 6  33    a    8.0
# 7  33    b    8.0
# 8  33    c    9.0

# index/columns are passed by keyword: pandas 2.x no longer accepts them
# positionally. With `values` omitted, every remaining column ('value' here)
# is pivoted, which produces two-level column labels.
df2 = df.pivot(index='item', columns='t1')
df2
# Out[61]:
#      value
# t1      11   22   33
# item
# a      3.4  5.0  8.0
# b      3.0  6.0  8.0
# c      4.0  7.0  9.0

# Same data with the roles swapped: one row per t1, one column per item.
df2 = df.pivot(index='t1', columns='item')
df2
# Out[62]:
#      value
# item     a    b    c
# t1
# 11     3.4  3.0  4.0
# 22     5.0  6.0  7.0
# 33     8.0  8.0  9.0
# Wide table.
# fake_data was never defined in this script; it is defined here with the
# values shown in the sample output below so the example actually runs.
fake_data = {'subject': ['math', 'english'],
             'A': [88, 90],
             'B': [70, 80],
             'C': [60, 78]}
test = pd.DataFrame(fake_data, columns=['subject', 'A', 'B', 'C'])
test
#    subject   A   B   C
# 0     math  88  70  60
# 1  english  90  80  78

# Convert to a long table: 'subject' stays as the identifier column and each
# remaining column becomes (variable, value) rows.
pd.melt(test, id_vars=['subject'])
#    subject variable  value
# 0     math        A     88
# 1  english        A     90
# 2     math        B     70
# 3  english        B     80
# 4     math        C     60
# 5  english        C     78