以下幾種pandas描述性統計(計數類)方法容易混淆,在此記錄一下
1,pandas.DataFrame.count
DataFrame.count(self, axis=0, level=None, numeric_only=False)
#統計每列或每行的非NA值個數
# 參數
"""
axis : {0 or ‘index’, 1 or ‘columns’}, 默認爲 0
level : int 或 str, 對於分層索引指定層級(注意:此參數自 pandas 1.3 起棄用,2.0 已移除,新版本請改用 groupby(level=...).count())
numeric_only : boolean, 默認爲 False 如果爲True 僅包含float,int或boolean數據
"""
# demo
# No import of `pd` / `np` is visible earlier in these notes, so bring them in
# here to keep the snippet runnable on its own.
import numpy as np
import pandas as pd

df = pd.DataFrame({'k1': list('aabba'),
                   'k2': ['one', 'two', 'one', 'two', 'one'],
                   'value1': [np.nan, 2, np.nan, 3, 4],
                   'value2': [10, 20, np.nan, np.nan, 40]})
# The frame looks like this:
#   k1   k2  value1  value2
# 0  a  one     NaN    10.0
# 1  a  two     2.0    20.0
# 2  b  one     NaN     NaN
# 3  b  two     3.0     NaN
# 4  a  one     4.0    40.0

# Non-NA count of every column.
non_na_per_column = df.count(axis=0)
print(non_na_per_column)
# k1        5
# k2        5
# value1    3
# value2    3
# dtype: int64

# Non-NA count per value of the second index level (k2).
# DataFrame.count(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the documented replacement and yields the
# same table on old and new versions alike.
non_na_by_k2 = df.set_index(['k1', 'k2']).groupby(level=1).count()
print(non_na_by_k2)
#      value1  value2
# k2
# one       1       2
# two       2       1

# groupby objects expose count() as well: non-NA values inside each group,
# called without any arguments.
non_na_by_group = df.groupby(['k1', 'k2']).count()
print(non_na_by_group)
#         value1  value2
# k1 k2
# a  one       1       2
#    two       1       1
# b  one       0       0
#    two       1       0

# Series has the same method.
k1_non_na = df['k1'].count()
print(k1_non_na)
# 5
2,pandas.Series.value_counts
Series.value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True)
"""
對Series裏面的每個值進行計數並且排序,
結果對象將按降序排列,默認情況下不包括NA值。
"""
#參數
"""
normalize:bool, 默認False 如果爲True,返回頻率佔比
sort:bool, default True,按照頻率排序
ascending:bool, default False 降序排序
bins:不是計數,而是將它們分組到半開的容器中,與pd.cut類似,只針對數值
dropna:bool, default True,忽略NAN
"""
# demo
df = pd.DataFrame({'k1': list('aabba'),
                   'k2': ['one', 'two', 'one', 'two', 'one'],
                   'value1': [11, 20, 21, 21, 41],
                   'value2': [11, 20, 21, 31, 40]})

# Relative frequencies instead of raw counts, smallest share first.
k2_share = df['k2'].value_counts(ascending=True, normalize=True)
print(k2_share)
# two    0.4
# one    0.6
# Name: k2, dtype: float64
# (pandas >= 2.0 names the result `proportion` and moves `k2` to the index name)

# Bucket the numeric values into two half-open bins before counting.
value2_binned = df['value2'].value_counts(bins=2)
print(value2_binned)
# (10.97, 25.5]    3
# (25.5, 40.0]     2
# Name: value2, dtype: int64
# (pandas >= 2.0 names the result `count`)

# A DataFrame gets per-column value counts through apply().
# The top-level pd.value_counts helper is deprecated since pandas 2.1, so the
# Series method is passed instead; apply() hands it one column at a time.
counts_per_column = df[['value1', 'value2']].apply(pd.Series.value_counts)
print(counts_per_column)
#     value1  value2
# 11     1.0     1.0
# 20     1.0     1.0
# 21     2.0     1.0
# 31     NaN     1.0
# 40     NaN     1.0
# 41     1.0     NaN
3,pandas.DataFrame.size
DataFrame.size
# 返回此對象中的元素個數(整數)
# demo
sample = {
    'k1': list('aabba'),
    'k2': ['one', 'two', 'one', 'two', 'one'],
    'value1': [np.nan, 2, np.nan, 3, 4],
    'value2': [10, 20, np.nan, np.nan, 40],
}
df = pd.DataFrame(sample)
print(df)

element_total = df.size
print(element_total)  # 20

# How size differs from count():
# 1. size is a property, count() is a method
# 2. size reports the number of elements of the whole DataFrame / Series,
#    while count() can tally per row or per column
# 3. size includes NaN, while count() can leave NaN out

# Inside a groupby, size() counts NaN entries and count() does not.
value1_by_group = df.groupby(['k1', 'k2'])['value1']

print(value1_by_group.size())
# k1  k2
# a   one    2
#     two    1
# b   one    1
#     two    1
# Name: value1, dtype: int64

print(value1_by_group.count())
# k1  k2
# a   one    1
#     two    1
# b   one    0
#     two    1
# Name: value1, dtype: int64
4,pandas.Series.unique
Series.unique(self)
# 返回Series中的唯一值(按出現順序排列),也就是去重後剩餘的值
# demo
columns = {
    'k1': list('aabba'),
    'k2': ['one', 'two', 'one', 'two', 'one'],
    'value1': [11, 20, 21, 21, 41],
    'value2': [11, 20, 21, 31, 40],
}
df = pd.DataFrame(columns)
print(df)

# The duplicated 21 in value1 collapses to a single entry.
distinct_value1 = df['value1'].unique()
print(distinct_value1)  # [11 20 21 41]
5,pandas.DataFrame.nunique
DataFrame.nunique(self, axis=0, dropna=True)
# 返回沿指定軸上唯一值的個數(默認忽略NAN),也就是去重後剩餘值的個數
#參數
"""
axis:軸向 默認0軸
dropna:bool, default True,忽略NAN
"""
# demo
frame_data = {
    'k1': list('aabba'),
    'k2': ['one', 'two', 'one', 'two', 'one'],
    'value1': [11, 20, 21, np.nan, 41],
    'value2': [11, np.nan, 21, np.nan, 40],
}
df = pd.DataFrame(frame_data)

# Distinct values per column; NaN is not counted.
distinct_per_column = df.nunique(axis=0)
print(distinct_per_column)
# k1        2    -> ['a', 'b'], two values
# k2        2    -> ['one', 'two'], two values
# value1    4    -> [11, 20, 21, 41]
# value2    3    -> [11, 21, 40]
# dtype: int64

# Series offers nunique() too, just without the axis parameter.
k1_distinct = df['k1'].nunique()
print(k1_distinct)