關注微信公共號:小程在線
關注CSDN博客:程志偉的博客
詳細內容爲 《R語言遊戲數據分析與挖掘》第五章學習筆記之數據清洗
5.3 數據轉換
5.4 啞變量處理
# 5.3.1 產生衍生變量
> rawdata <- read.csv("數據轉換數據.csv",na.strings = NA)
> # 查看數據的前六行
> head(rawdata)
playerid registration firstpaydate days lifetime
1 1001984428 20160408 NA 4 101
2 1002360742 20160407 20160407 12 16
3 1003943907 20160423 NA 1 1
4 100500571 20160406 20160407 10 101
5 1005541598 20160414 NA 1 1
6 1007334849 20160426 NA 2 2
# 將註冊日期變量轉換成日期格式
> rawdata$registration <- as.Date(paste(substr(rawdata$registration,1,4),
+ substr(rawdata$registration,5,6),
+ substr(rawdata$registration,7,8),
+ sep="/"),
+ "%Y/%m/%d")
# 將首次付費日期轉換成日期格式
> rawdata$firstpaydate <- as.Date(paste(substr(rawdata$firstpaydate,1,4),
+ substr(rawdata$firstpaydate,5,6),
+ substr(rawdata$firstpaydate,7,8),
+ sep="/"),
+ "%Y/%m/%d")
# 查看數據的前六行
> head(rawdata)
playerid registration firstpaydate days lifetime
1 1001984428 2016-04-08 <NA> 4 101
2 1002360742 2016-04-07 2016-04-07 12 16
3 1003943907 2016-04-23 <NA> 1 1
4 100500571 2016-04-06 2016-04-07 10 101
5 1005541598 2016-04-14 <NA> 1 1
6 1007334849 2016-04-26 <NA> 2 2
# 增加ispay變量:0表示非付費用戶,1表示付費用戶
> rawdata$ispay <- ifelse(!is.na(rawdata$firstpaydate),1,0)
# 增加isnewpay變量:0表示非新增首日付費用戶,1表示新增首日付費用戶
> rawdata$isnewpay <- ifelse(rawdata$registration==rawdata$firstpaydate,
+ 1,0)
> rawdata[is.na(rawdata$isnewpay),'isnewpay'] <- 0
# 查看數據前6行
> head(rawdata)
playerid registration firstpaydate days lifetime ispay isnewpay
1 1001984428 2016-04-08 <NA> 4 101 0 0
2 1002360742 2016-04-07 2016-04-07 12 16 1 1
3 1003943907 2016-04-23 <NA> 1 1 0 0
4 100500571 2016-04-06 2016-04-07 10 101 1 0
5 1005541598 2016-04-14 <NA> 1 1 0 0
6 1007334849 2016-04-26 <NA> 2 2 0 0
# 5.3.2 數據分箱
> # 利用cut函數對數據進行分箱
> # 對days(活躍天數)進行分箱操作
> rawdata$days_interval <- cut(rawdata$days,
+ breaks=c(0,30,60,90,Inf),
+ labels=c('一個月內','31~60天','61~90天','三個月以上'))
> # 對lifetime(生命週期)進行分箱操作
> # 注意:cut默認區間爲左開右閉,斷點21對應三週,因此標籤'小於兩週'實際覆蓋的是8~21天
> rawdata$lifetime_interval <- cut(rawdata$lifetime,
+ breaks=c(0,7,21,30,90,Inf),
+ labels=c('小於一週','小於兩週','小於一個月',
+ '小於三個月','三個月以上'))
# 查看前六行
> head(rawdata)
playerid registration firstpaydate days lifetime ispay isnewpay
1 1001984428 2016-04-08 <NA> 4 101 0 0
2 1002360742 2016-04-07 2016-04-07 12 16 1 1
3 1003943907 2016-04-23 <NA> 1 1 0 0
4 100500571 2016-04-06 2016-04-07 10 101 1 0
5 1005541598 2016-04-14 <NA> 1 1 0 0
6 1007334849 2016-04-26 <NA> 2 2 0 0
days_interval lifetime_interval
1 一個月內 三個月以上
2 一個月內 小於兩週
3 一個月內 小於一週
4 一個月內 三個月以上
5 一個月內 小於一週
6 一個月內 小於一週
# 5.3.3 數據標準化變換
# 採用(x-mu)/std的標準化方法,與scale()函數效果一樣
# 注意:preProcess來自caret包,需先執行library(caret);其默認method爲c("center","scale")
> standard <- preProcess(iris)
> head(predict(standard,iris))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 -0.8976739 1.01560199 -1.335752 -1.311052 setosa
2 -1.1392005 -0.13153881 -1.335752 -1.311052 setosa
3 -1.3807271 0.32731751 -1.392399 -1.311052 setosa
4 -1.5014904 0.09788935 -1.279104 -1.311052 setosa
5 -1.0184372 1.24503015 -1.335752 -1.311052 setosa
6 -0.5353840 1.93331463 -1.165809 -1.048667 setosa
> head(scale(iris[,1:4]))
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] -0.8976739 1.01560199 -1.335752 -1.311052
[2,] -1.1392005 -0.13153881 -1.335752 -1.311052
[3,] -1.3807271 0.32731751 -1.392399 -1.311052
[4,] -1.5014904 0.09788935 -1.279104 -1.311052
[5,] -1.0184372 1.24503015 -1.335752 -1.311052
[6,] -0.5353840 1.93331463 -1.165809 -1.048667
#採用(x-min(x))/(max(x)-min(x))的標準化方法
> standard <- preProcess(iris, method = 'range')
> head(predict(standard,iris))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 0.22222222 0.6250000 0.06779661 0.04166667 setosa
2 0.16666667 0.4166667 0.06779661 0.04166667 setosa
3 0.11111111 0.5000000 0.05084746 0.04166667 setosa
4 0.08333333 0.4583333 0.08474576 0.04166667 setosa
5 0.19444444 0.6666667 0.06779661 0.04166667 setosa
6 0.30555556 0.7916667 0.11864407 0.12500000 setosa
> fun <- function(x) (x-min(x))/(max(x)-min(x))
> head(sapply(iris[,1:4],fun))
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] 0.22222222 0.6250000 0.06779661 0.04166667
[2,] 0.16666667 0.4166667 0.06779661 0.04166667
[3,] 0.11111111 0.5000000 0.05084746 0.04166667
[4,] 0.08333333 0.4583333 0.08474576 0.04166667
[5,] 0.19444444 0.6666667 0.06779661 0.04166667
[6,] 0.30555556 0.7916667 0.11864407 0.12500000
# 5.4 數據啞變量處理
> # 構建customers數據集
> customers<-data.frame(id=c(10,20,30,40,50),
+ gender=c("male","female","female","male","female"),
+ mood=c("happy","sad","happy","sad","happy"),
+ outcome=c(1,1,0,0,0))
> customers
id gender mood outcome
1 10 male happy 1
2 20 female sad 1
3 30 female happy 0
4 40 male sad 0
5 50 female happy 0
# 對因子型變量進行啞變量處理
> # 創建新數據框customers.new
> customers.new <- customers[,c('id','outcome')]
# 對gender變量進行啞變量處理
> customers.new$gender.male <- ifelse(customers$gender=='male',1,0)
> customers.new$gender.female <- ifelse(customers$gender=='female',1,0)
# 對mood變量進行啞變量處理
> customers.new$mood.happy <- ifelse(customers$mood=='happy',1,0)
> customers.new$mood.sad <- ifelse(customers$mood=='sad',1,0)
> customers.new
id outcome gender.male gender.female mood.happy mood.sad
1 10 1 1 0 1 0
2 20 1 0 1 0 1
3 30 0 0 1 1 0
4 40 0 1 0 0 1
5 50 0 0 1 1 0
# 加載caret包到內存
> library(caret)
> # 查看customers的數據結構
> str(customers)
'data.frame': 5 obs. of 4 variables:
$ id : num 10 20 30 40 50
$ gender : Factor w/ 2 levels "female","male": 2 1 1 2 1
$ mood : Factor w/ 2 levels "happy","sad": 1 2 1 2 1
$ outcome: num 1 1 0 0 0
# 利用dummyVars函數對customers數據進行啞變量處理
> dmy<-dummyVars(~.,data=customers)
# 對自身變量進行預測,並轉換成data.frame格式
> trsf<-data.frame(predict(dmy,newdata=customers))
# 查看轉換結果
> trsf
id gender.female gender.male mood.happy mood.sad outcome
1 10 0 1 1 0 1
2 20 1 0 0 1 1
3 30 1 0 1 0 0
4 40 0 1 0 1 0
5 50 1 0 1 0 0
# 將outcome變量轉換成因子型變量
> customers$outcome <- as.factor(customers$outcome)
# 利用dummyVars函數對customers數據進行啞變量處理
> dmy<-dummyVars(~.,data=customers)
# 對自身變量進行預測,並轉換成data.frame格式
> trsf<-data.frame(predict(dmy,newdata=customers))
# 查看轉換結果
> trsf
id gender.female gender.male mood.happy mood.sad outcome.0 outcome.1
1 10 0 1 1 0 0 1
2 20 1 0 0 1 0 1
3 30 1 0 1 0 1 0
4 40 0 1 0 1 1 0
5 50 1 0 1 0 1 0
# 只對gender變量進行啞變量轉換
> dmy.gender <- dummyVars(~gender,data=customers)
> trsf.gender <- data.frame(predict(dmy.gender,newdata=customers))
> trsf.gender
gender.female gender.male
1 0 1
2 1 0
3 1 0
4 0 1
5 1 0
# 將levelsOnly和fullRank設置爲TRUE
# levelsOnly=TRUE:列名只保留因子水平名(如male、sad),不再帶變量名前綴
# fullRank=TRUE:每個因子去掉一個基準水平,避免啞變量之間完全共線性
> customers<-data.frame(id=c(10,20,30,40,50),
+ gender=c("male","female","female","male","female"),
+ mood=c("happy","sad","happy","sad","happy"),
+ outcome=c(1,1,0,0,0))
> dmy<-dummyVars(~.,data=customers,levelsOnly=TRUE,fullRank=TRUE)
> trsf<-data.frame(predict(dmy,newdata=customers))
> trsf
id male sad outcome
1 10 1 0 1
2 20 0 1 1
3 30 0 0 0
4 40 1 1 0
5 50 0 0 0