一. 查看每列的數據結構
def print_col_info(dataset):
    """Print per-column info for a 2-D array *dataset*.

    For every column, prints:
      1. the sorted distinct values,
      2. the number of distinct values.

    Parameters
    ----------
    dataset : 2-D numpy array (rows are samples, columns are features).
    """
    # Original paste had lost its indentation (SyntaxError); restored here.
    col_num = dataset.shape[1]
    for i in range(col_num):
        print('\ncol-{} info: '.format(i))
        # set() drops duplicates; np.sort gives a deterministic order.
        temp = np.sort(list(set(dataset[:, i])))
        print('values: {}'.format(temp))
        print('values num: {}'.format(temp.shape[0]))
二. 通過map函數對離散值的字符串統一去掉空格
# Indices of the columns that hold discrete (string) values.
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
# Strip surrounding whitespace from every string cell in those columns.
# (Original paste had lost the loop-body indentation; restored here.)
for col in str_cols:
    df.iloc[:, col] = df.iloc[:, col].map(lambda x: x.strip())
map函數的用法:接受函數爲參數,或把函數作爲返回結果的函數
def square(x):
    """Return x multiplied by itself."""
    return x * x

# map() applies a function to each element lazily; list() materializes it.
xx = list(map(square, range(10)))
三. 查看數據是否有缺失值
# Check whether the data has missing values -- per column:
# any() -> True if the column has at least one NaN; all() -> True only if
# the column is entirely NaN.
df.isnull().any()
df.isnull().all()
四. 將某固定值替換爲nan,再刪除nan數據
# No return value: replaces in place (inplace=True).
# Replace the literal "?" string with the NaN missing-value marker.
df.replace("?",np.nan,inplace=True)
# Then simply drop every sample (row) that contains a missing value.
df.dropna(inplace=True)
將每列的0替換爲None
# First check whether there are any 'true' missing values in the column.
pima['serum_insulin'].isnull().sum()
# Manually replace 0 with None so it is treated as a missing value.
pima['serum_insulin'] = pima['serum_insulin'].map(lambda x:x if x != 0 else None)
# Check the missing-value count again (should now include the former zeros).
pima['serum_insulin'].isnull().sum()
五. 對離散型數據進行編碼
from sklearn import preprocessing
label_encoder = []  # One encoder per column (None for numeric columns).
encoded_set = np.empty(df.shape)
# Original paste had lost the loop/if indentation (SyntaxError); restored.
for col in range(df.shape[1]):
    encoder = None
    if df.iloc[:, col].dtype == object:  # string-valued column
        encoder = preprocessing.LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(df.iloc[:, col])
    else:  # numeric column: copy through unchanged
        encoded_set[:, col] = df.iloc[:, col]
    # Append even when None, so indices line up with column indices.
    label_encoder.append(encoder)
六. 通過describe查看連續型數據的min和max值,對相差較大的列進行範圍縮放
df.describe()  # Inspect min/max of the continuous columns first.
cols = [2, 10, 11]  # Columns whose value ranges differ widely.
data_scalers = []  # Keep each fitted scaler (e.g. for later inverse_transform).
# Original paste had lost the loop-body indentation (SyntaxError); restored.
for col in cols:
    data_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    # MinMaxScaler expects a 2-D array, hence reshape(-1, 1); ravel flattens back.
    encoded_set[:, col] = np.ravel(
        data_scaler.fit_transform(encoded_set[:, col].reshape(-1, 1)))
    data_scalers.append(data_scaler)
七. 對傳入的模型計算準確度、精確度、召回率和F1 score;
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
def score_cal(model, test_X, test_y):
    """Print cross-validated accuracy, weighted precision, recall and F1
    for *model* on (test_X, test_y), then a confusion matrix and a full
    classification report computed from the model's own predictions.

    Parameters
    ----------
    model : a fitted estimator with a predict() method, usable by
        cross_val_score.
    test_X, test_y : feature matrix and label vector to evaluate on.
    """
    # Original paste had lost the body indentation (SyntaxError); restored.
    num_validations = 5
    accuracy = cross_val_score(model, test_X, test_y,
                               scoring='accuracy', cv=num_validations)
    print('準確率:{:.2f}%'.format(accuracy.mean()*100))
    precision = cross_val_score(model, test_X, test_y,
                                scoring='precision_weighted', cv=num_validations)
    print('精確度:{:.2f}%'.format(precision.mean()*100))
    recall = cross_val_score(model, test_X, test_y,
                             scoring='recall_weighted', cv=num_validations)
    print('召回率:{:.2f}%'.format(recall.mean()*100))
    f1 = cross_val_score(model, test_X, test_y,
                         scoring='f1_weighted', cv=num_validations)
    print('F1 值:{:.2f}%'.format(f1.mean()*100))
    # Print the performance report.
    y_pred = model.predict(test_X)
    confusion_mat = confusion_matrix(test_y, y_pred)
    print(confusion_mat)  # see what the confusion matrix looks like
    # Use sklearn directly to print precision, recall and F1.
    # NOTE(review): hard-coded labels assume the adult-income binary task —
    # confirm they match the dataset actually passed in.
    target_names = ['<=50K', '>50K']
    print(classification_report(test_y, y_pred, target_names=target_names))
八. sklearn隨機分離測試集和訓練集
from sklearn.model_selection import train_test_split
# Randomly hold out 20% of the samples as the test set; a fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 666)
九. sklearn的網格搜索Grid Search
## knn的網格搜索
# Candidate parameter grids for a KNN grid search.
# BUG FIX: the original wrapped the two dicts in a set literal
# ({ {...}, {...} }), which raises TypeError because dicts are unhashable.
# GridSearchCV expects a dict or a LIST of dicts.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)],
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        # p is the Minkowski-distance power; only meaningful with 'distance'.
        'p': [i for i in range(1, 6)],
    },
]
十.定義函數來搜索所有給定的參數,通過指標來優化機器學習流水線
# 導入網絡搜索模塊
from sklearn.model_selection import GridSearchCV
def get_best_model_accuracy(model, params, X, y):
    """Grid-search *params* on *model*, fit on (X, y), and print the best
    score, the best parameters, and the mean fit/score times.

    Parameters
    ----------
    model : estimator to tune.
    params : parameter grid (dict or list of dicts) for GridSearchCV.
    X, y : training features and labels.
    """
    # Original paste had lost the body indentation (SyntaxError); restored.
    grid = GridSearchCV(model,           # the model to search over
                        params,          # the parameters to try
                        error_score=0.)  # score 0 instead of raising on failure
    grid.fit(X, y)  # fit model and parameters
    # Classic performance metric.
    print("Best Accuracy: {}".format(grid.best_score_))
    # Parameters that achieved the best accuracy.
    print("Best Parameters: {}".format(grid.best_params_))
    # Average fit time in seconds.
    # BUG FIX: the original passed 3 as an ignored extra argument to
    # str.format; round(..., 3) was intended, matching the score-time line.
    print("Average Time to Fit(s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # Average prediction time in seconds — a hint of real-world latency.
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))
十一. Dataframe遍歷對每行值進行改變
def Ticket_First_Let(x):
    """Return the first element of *x* (first character of a ticket string).

    Original paste had lost the body indentation (SyntaxError); restored.
    """
    return x[0]
# Derive a new feature from the first letter of each 'Ticket' value,
# applied to both the training and the test frame.
X_train['Ticket_First_Letter'] = X_train['Ticket'].apply(Ticket_First_Let)
X_test['Ticket_First_Letter'] = X_test['Ticket'].apply(Ticket_First_Let)