import pandas as pd
import sklearn
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
#選取一些特徵作爲我們劃分的依據
x = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']
# 填充缺失值
x['age'].fillna(x['age'].mean(), inplace=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
dt = DictVectorizer(sparse=False)
print(x_train.to_dict(orient="record"))
# 按行,樣本名字爲鍵,列名也爲鍵,[{"1":1,"2":2,"3":3}]
x_train = dt.fit_transform(x_train.to_dict(orient="record"))
x_test = dt.fit_transform(x_test.to_dict(orient="record"))
# 使用決策樹
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
dt_predict = dtc.predict(x_test)
print(dtc.score(x_test, y_test))
print(classification_report(y_test, dt_predict, target_names=["died", "survived"]))
# 使用隨機森林
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_y_predict = rfc.predict(x_test)
print(rfc.score(x_test, y_test))
print(classification_report(y_test, rfc_y_predict, target_names=["died", "survived"]))
泰坦尼克號乘客-------------判斷乘客生還
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.