R語言數據清洗（數據轉換與啞變量處理）

關注微信公眾號：小程在線

關注CSDN博客：程志偉的博客

詳細內容爲《R語言遊戲數據分析與挖掘》第五章學習筆記之數據清洗

5.3 數據轉換

5.4 啞變量處理

# 5.3.1 產生衍生變量

> rawdata <- read.csv("數據轉換數據.csv",na.strings = NA)
> # 查看數據的前六行
> head(rawdata)
playerid registration firstpaydate days lifetime
1 1001984428 20160408 NA 4 101
2 1002360742 20160407 20160407 12 16
3 1003943907 20160423 NA 1 1
4 100500571 20160406 20160407 10 101
5 1005541598 20160414 NA 1 1
6 1007334849 20160426 NA 2 2

# 將註冊日期變量轉換成日期格式
> rawdata$registration <- as.Date(paste(substr(rawdata$registration,1,4),
+ substr(rawdata$registration,5,6),
+ substr(rawdata$registration,7,8),
+ sep="/"),
+ "%Y/%m/%d")

# 將首次付費日期轉換成日期格式
> rawdata$firstpaydate <- as.Date(paste(substr(rawdata$firstpaydate,1,4),
+ substr(rawdata$firstpaydate,5,6),
+ substr(rawdata$firstpaydate,7,8),
+ sep="/"),
+ "%Y/%m/%d")

# 查看數據的前六行
> head(rawdata)
playerid registration firstpaydate days lifetime
1 1001984428 2016-04-08 <NA> 4 101
2 1002360742 2016-04-07 2016-04-07 12 16
3 1003943907 2016-04-23 <NA> 1 1
4 100500571 2016-04-06 2016-04-07 10 101
5 1005541598 2016-04-14 <NA> 1 1
6 1007334849 2016-04-26 <NA> 2 2

# 增加ispay變量：0表示非付費用戶，1表示付費用戶
> rawdata$ispay <- ifelse(!is.na(rawdata$firstpaydate),1,0)

# 增加isnewpay變量：0表示非新增首日付費用戶，1表示新增首日付費用戶
> rawdata$isnewpay <- ifelse(rawdata$registration==rawdata$firstpaydate,
+ 1,0)
> rawdata[is.na(rawdata$isnewpay),'isnewpay'] <- 0

# 查看數據前6行
> head(rawdata)
playerid registration firstpaydate days lifetime ispay isnewpay
1 1001984428 2016-04-08 <NA> 4 101 0 0
2 1002360742 2016-04-07 2016-04-07 12 16 1 1
3 1003943907 2016-04-23 <NA> 1 1 0 0
4 100500571 2016-04-06 2016-04-07 10 101 1 0
5 1005541598 2016-04-14 <NA> 1 1 0 0
6 1007334849 2016-04-26 <NA> 2 2 0 0

# 5.3.2 數據分箱
> # 利用cut函數對數據進行分箱
> # 對days(活躍天數)進行分箱操作
> rawdata$days_interval <- cut(rawdata$days,
+ breaks=c(0,30,60,90,Inf),
+ labels=c('一個月內','31~60天','61~90天','三個月以上'))
> # 對lifetime(生命週期)進行分箱操作
> rawdata$lifetime_interval <- cut(rawdata$lifetime,
+ breaks=c(0,7,21,30,90,Inf),
+ labels=c('小於一週','小於兩週','小於一個月',
+ '小於三個月','三個月以上'))

# 查看前六行
> head(rawdata)
playerid registration firstpaydate days lifetime ispay isnewpay
1 1001984428 2016-04-08 <NA> 4 101 0 0
2 1002360742 2016-04-07 2016-04-07 12 16 1 1
3 1003943907 2016-04-23 <NA> 1 1 0 0
4 100500571 2016-04-06 2016-04-07 10 101 1 0
5 1005541598 2016-04-14 <NA> 1 1 0 0
6 1007334849 2016-04-26 <NA> 2 2 0 0
days_interval lifetime_interval
1 一個月內 三個月以上
2 一個月內 小於兩週
3 一個月內 小於一週
4 一個月內 三個月以上
5 一個月內 小於一週
6 一個月內 小於一週

# 5.3.3 數據標準化變換

# 採用(x-mu)/std的標準化方法，與scale()函數效果一樣
# （preProcess函數來自caret包，使用前需先執行library(caret)加載）
> standard <- preProcess(iris)
> head(predict(standard,iris))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 -0.8976739 1.01560199 -1.335752 -1.311052 setosa
2 -1.1392005 -0.13153881 -1.335752 -1.311052 setosa
3 -1.3807271 0.32731751 -1.392399 -1.311052 setosa
4 -1.5014904 0.09788935 -1.279104 -1.311052 setosa
5 -1.0184372 1.24503015 -1.335752 -1.311052 setosa
6 -0.5353840 1.93331463 -1.165809 -1.048667 setosa
> head(scale(iris[,1:4]))
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] -0.8976739 1.01560199 -1.335752 -1.311052
[2,] -1.1392005 -0.13153881 -1.335752 -1.311052
[3,] -1.3807271 0.32731751 -1.392399 -1.311052
[4,] -1.5014904 0.09788935 -1.279104 -1.311052
[5,] -1.0184372 1.24503015 -1.335752 -1.311052
[6,] -0.5353840 1.93331463 -1.165809 -1.048667

#採用(x-min(x))/(max(x)-min(x))的標準化方法
> standard <- preProcess(iris, method = 'range')
> head(predict(standard,iris))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 0.22222222 0.6250000 0.06779661 0.04166667 setosa
2 0.16666667 0.4166667 0.06779661 0.04166667 setosa
3 0.11111111 0.5000000 0.05084746 0.04166667 setosa
4 0.08333333 0.4583333 0.08474576 0.04166667 setosa
5 0.19444444 0.6666667 0.06779661 0.04166667 setosa
6 0.30555556 0.7916667 0.11864407 0.12500000 setosa
> fun <- function(x) (x-min(x))/(max(x)-min(x))
> head(sapply(iris[,1:4],fun))
Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,] 0.22222222 0.6250000 0.06779661 0.04166667
[2,] 0.16666667 0.4166667 0.06779661 0.04166667
[3,] 0.11111111 0.5000000 0.05084746 0.04166667
[4,] 0.08333333 0.4583333 0.08474576 0.04166667
[5,] 0.19444444 0.6666667 0.06779661 0.04166667
[6,] 0.30555556 0.7916667 0.11864407 0.12500000

# 5.4 數據啞變量處理
> # 構建customers數據集
> customers<-data.frame(id=c(10,20,30,40,50),
+ gender=c("male","female","female","male","female"),
+ mood=c("happy","sad","happy","sad","happy"),
+ outcome=c(1,1,0,0,0))
> customers
id gender mood outcome
1 10 male happy 1
2 20 female sad 1
3 30 female happy 0
4 40 male sad 0
5 50 female happy 0

# 對因子型變量進行啞變量處理
> # 創建新數據框customers.new
> customers.new <- customers[,c('id','outcome')]

# 對gender變量進行啞變量處理
> customers.new$gender.male <- ifelse(customers$gender=='male',1,0)
> customers.new$gender.female <- ifelse(customers$gender=='female',1,0)

# 對mood變量進行啞變量處理
> customers.new$mood.happy <- ifelse(customers$mood=='happy',1,0)
> customers.new$mood.sad <- ifelse(customers$mood=='sad',1,0)
> customers.new
id outcome gender.male gender.female mood.happy mood.sad
1 10 1 1 0 1 0
2 20 1 0 1 0 1
3 30 0 0 1 1 0
4 40 0 1 0 0 1
5 50 0 0 1 1 0

# 加載caret包到內存
> library(caret)
> # 查看customers的數據結構
> str(customers)
'data.frame': 5 obs. of 4 variables:
$ id : num 10 20 30 40 50
$ gender : Factor w/ 2 levels "female","male": 2 1 1 2 1
$ mood : Factor w/ 2 levels "happy","sad": 1 2 1 2 1
$ outcome: num 1 1 0 0 0

# 利用dummyVars函數對customers數據進行啞變量處理
> dmy<-dummyVars(~.,data=customers)

# 對自身變量進行預測，並轉換成data.frame格式
> trsf<-data.frame(predict(dmy,newdata=customers))

# 查看轉換結果
> trsf
id gender.female gender.male mood.happy mood.sad outcome
1 10 0 1 1 0 1
2 20 1 0 0 1 1
3 30 1 0 1 0 0
4 40 0 1 0 1 0
5 50 1 0 1 0 0

# 將outcome變量轉換成因子型變量
> customers$outcome <- as.factor(customers$outcome)

# 利用dummyVars函數對customers數據進行啞變量處理
> dmy<-dummyVars(~.,data=customers)

# 對自身變量進行預測，並轉換成data.frame格式
> trsf<-data.frame(predict(dmy,newdata=customers))

# 查看轉換結果
> trsf
id gender.female gender.male mood.happy mood.sad outcome.0 outcome.1
1 10 0 1 1 0 0 1
2 20 1 0 0 1 0 1
3 30 1 0 1 0 1 0
4 40 0 1 0 1 1 0
5 50 1 0 1 0 1 0

# 只對gender變量進行啞變量轉換
> dmy.gender <- dummyVars(~gender,data=customers)
> trsf.gender <- data.frame(predict(dmy.gender,newdata=customers))
> trsf.gender
gender.female gender.male
1 0 1
2 1 0
3 1 0
4 0 1
5 1 0

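# 將levelsOnly和fullRank設置爲TRUE：
# levelsOnly=TRUE表示列名只保留因子水平名（去掉變量名前綴，如gender.male變爲male）；
# fullRank=TRUE表示每個因子去掉一個水平（此處去掉female和happy），以避免啞變量之間完全共線性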
> customers<-data.frame(id=c(10,20,30,40,50),
+ gender=c("male","female","female","male","female"),
+ mood=c("happy","sad","happy","sad","happy"),
+ outcome=c(1,1,0,0,0))
> dmy<-dummyVars(~.,data=customers,levelsOnly=TRUE,fullRank=TRUE)
> trsf<-data.frame(predict(dmy,newdata=customers))
> trsf
id male sad outcome
1 10 1 0 1
2 20 0 1 1
3 30 0 0 0
4 40 1 1 0
5 50 0 0 0

R語言數據清洗（數據轉換與啞變量處理）

.NET有哪些好用的定時任務調度框架

Python 將PDF轉爲PDF/A、PDF/X，以及PDF/A轉回PDF

elk3

Kafka存儲機制

aws語音呼叫調用，告警電話

深度學習框架火焰圖pprof和CUDA Nsys配置指南

【轉】[C#] WebAPI 防止併發調用二（冥等性）

爬蟲兩種繞過5s盾的方法

【轉】[SQL Server]關掉 SSMS 的 IntelliSense

號稱能打敗MLP的KAN到底行不行？數學核心原理全面解析

Kettle 安裝與簡單案例介紹

GIT 史上最詳細Git使用教程

Julia（未來可能替代Python與R語言）數據抽樣與結果評價

mysql 免安裝版本

R語言兩種方法連接oracle以及將處理後的數據導入數據庫中

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

R語言 數據清洗（數據轉換與啞變量處理）

R語言數據清洗（數據轉換與啞變量處理）