# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
# Build a DataFrame from a numpy.ndarray: a 3x4 grid of 0..11 with
# row labels A-C and column labels col0..col3.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=list("ABC"),
    columns=["col{}".format(i) for i in range(4)],
)
print(df)
# Build a DataFrame from a dict: the keys become the column names and every
# value list must have the same length.
dic = dict(name=["xiaoming", "xiaohong"], age=[18, 20], sex=["M", "F"])
df = pd.DataFrame(data=dic)
print(df)
# Build a DataFrame from a list of dicts: the dict keys become the column
# names, and a key missing from a record is filled with NaN.
dic = [{"name": "xiaoming", "age": 18, "sex": "F"}, {"name": "xiaohong", "age": 20}]
df = pd.DataFrame(dic)
print(df)
print(df.index)  # RangeIndex(start=0, stop=2, step=1)
print(df.columns)  # Index(['name', 'age', 'sex'], dtype='object')
print(df.values)
print(df.dtypes)  # name object / age int64 / sex object
print(df.shape)  # (2, 3)
print(type(df.values))  # <class 'numpy.ndarray'>
print(df.ndim)  # number of dimensions: 2
print(df.head())  # first 5 rows by default
print(df.tail())  # last 5 rows by default
print("*" * 50)
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())  # summary statistics for the numeric columns
# Sorting: order the rows by "age", largest first.
df = df.sort_values(by="age", ascending=False)
print(df)
# Disabled example: loading a DataFrame from a CSV file.
# df = pd.read_csv("test.csv")
# print(type(df))  # <class 'pandas.core.frame.DataFrame'>
# print(df)
# DataFrame slicing: a 6x6 frame of random integers in [10, 100).
df = pd.DataFrame(np.random.randint(10, 100, (6, 6)))
df.columns = ["col" + str(i) for i in range(6)]
print(df)
print(df["col1"])  # select a column -> pandas.core.series.Series
print(df[:3])  # select rows -> pandas.core.frame.DataFrame
# Select rows and columns at the same time.
print(df[:3][["col1", "col2"]])
print("*" * 50)
# loc: select data by label.
print(df.loc[0, "col1"])
print(df.loc[[0, 2], ["col1", "col3"]])
print(df.loc[0:2, ["col1", "col3"]])  # NOTE: a label slice includes both endpoints
# iloc: select data by integer position.
print(df.iloc[1, 1])
print(df.iloc[1:3, 2:3])  # NOTE: a positional slice excludes the end position
# Filtering data.
print(df[df["col1"] > 50])  # boolean indexing: keeps the rows where the condition holds
# Multiple conditions: parenthesise each one and join them with &.
print(df[(df["col1"] > 50) & (df["col1"] < 80)])
# String operations: replace the whole column with strings. Assigning the
# column by name swaps in a new column instead of writing strings into the
# existing int64 data, which pandas deprecates as an incompatible-dtype set.
df["col1"] = "a/b/c"
print(df["col1"].str.split("/"))
# NaN handling. An int64 column cannot hold NaN, so cast it to float
# explicitly before inserting the missing value.
df["col4"] = df["col4"].astype(float)
df.iloc[3, 4] = np.nan
print(pd.isnull(df))
print(df[pd.notnull(df["col4"])])  # rows whose col4 is not NaN
# dropna/fillna return new frames; df itself is left unchanged.
print(df.dropna(axis=0, how="any"))  # drop rows containing any NaN
print(df.dropna(axis=0, how="all"))  # drop rows where every value is NaN
print(df.fillna(100))  # replace NaN with 100
# Replace NaN with the mean of its column; numeric_only skips the string
# column col1, for which a mean cannot be computed.
print(df.fillna(df.mean(numeric_only=True)))
# Replace every 94 in the row at position 3 with 100. A single iloc call
# writes into df itself; the chained form df.iloc[3, :][mask] = 100 would
# only modify a temporary copy of the row.
df.iloc[3, (df.iloc[3] == 94).to_numpy()] = 100