通过基数R的另一种想法是将值粘贴到其先前的值(逐行),转换为因数以获得所有9个级别(使用expand.grid
仅包含您想要的级别-还可以处理NA),最后通过table
对值进行计数。最后一步是将ID与结果绑定,即
cbind.data.frame(df$userID,t(apply(df[-c(1:2)],1,function(i) {
i1 <- paste(i[-length(i)],i[-1]);
i1 <- factor(i1,levels = do.call(paste,expand.grid(c('Easy','Match','Hard'),c('Easy','Hard'))));
table(i1) })))
给出,
df$userID Easy Easy Match Easy Hard Easy Easy Match Match Match Hard Match Easy Hard Match Hard Hard Hard
1 3108 3 0 0 0 0 0 0 0 0
2 3207 0 0 1 1 1 0 0 0 0
3 3350 0 0 1 0 0 0 1 0 1
4 3961 0 0 0 0 0 0 0 0 1
5 4021 1 0 0 0 0 0 0 0 0
,
另一个类似于Sotos的方法,但是1)使用data.table
,2)不使用factor
和3)将table
替换为Rfast::rowTabulate
:
v <- c('Hard','Easy')
vv <- do.call(paste,expand.grid(v,v))
DT[,(vv) := {
mat <- mapply(paste,.SD[,-ncol(.SD),with=FALSE],-1L])
as.data.table(Rfast::rowTabulate(matrix(match(mat,vv,0L),nrow=.N)))
},.SDcols=Task_Alpha:Task_Delta]
输出:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Hard Match Hard Easy Hard Hard Match Match Match Easy Match Hard Easy Match Easy Easy Easy
1: 3108 -8.00 Easy Easy Easy Easy 0 0 0 0 0 0 0 0 3
2: 3207 3.00 Hard Easy Match Match 0 0 0 0 1 1 1 0 0
3: 3350 5.78 Hard Easy Hard Hard 1 0 1 0 0 0 1 0 0
4: 3961 10.00 Easy <NA> Hard Hard 1 0 0 0 0 0 0 0 0
5: 4021 10.00 Easy Easy <NA> Hard 0 0 0 0 0 0 0 0 1
数据:
library(data.table)
library(Rfast)
DT <- structure(list(
userID = c(3108L,3207L,3350L,3961L,4021L),Score = c(-8,3,5.78,10,10),Task_Alpha = structure(c(1L,2L,1L,1L),.Label = c("Easy","Hard"),class = "factor"),Task_Beta = structure(c(1L,NA,.Label = "Easy",Task_Charlie = structure(c(1L,3L,NA),"Hard","Match"),Task_Delta = structure(c(1L,2L),class = "factor")),class = "data.frame",row.names = c(NA,-5L))
setDT(DT)
了解这种方法在实际数据集上的运行速度以及实际数据集是否很大将很有趣。
编辑:添加了一些时间
library(data.table)
nr <- 1e6
vec <- c('Hard','Easy',NA)
DT <- data.table(userID=1:nr,Task_Alpha=sample(vec,nr,TRUE),Task_Beta=sample(vec,Task_Charlie=sample(vec,Task_Delta=sample(vec,TRUE))
df <- as.data.frame(DT)
DT0 <- copy(DT)
DT1 <- copy(DT)
DT2 <- copy(DT)
mtd0 <- function() {
t(apply(df[-1L],function(i) {
i1 <- paste(i[-length(i)],i[-1L]);
i1 <- factor(i1,'Hard'))));
table(i1)
}))
}
mtd1 <- function() {
f_cols <- names(DT0)[ sapply( DT0,is.factor ) ]
DT0[,(f_cols) := lapply(.SD,as.character),.SDcols = f_cols ]
#melt to long format
DT.melt <- melt( DT0,id.vars = "userID",measure.vars = patterns( task = "^Task_"))
#set order of Aplha-Beta-etc...
DT.melt[ grepl( "Alpha",variable ),order := 1 ]
DT.melt[ grepl( "Beta",order := 2 ]
DT.melt[ grepl( "Charlie",order := 3 ]
DT.melt[ grepl( "Delta",order := 4 ]
#order DT.melt
setorder( DT.melt,userID,order )
#fill in codes EE,etc...
DT.melt[,`:=`( code1 = gsub( "(^.).*","\\1",value ),code2 = gsub( "(^.).*",shift( value,type = "lead" ) ) ),by = userID ]
#filter only rows without NA
DT.melt <- DT.melt[ complete.cases( DT.melt ) ]
#cast to wide output
dcast( DT.melt,userID ~ paste0( code2,code1 ),fun.aggregate = length )
}
mtd2 <- function() {
v <- c('Hard','Easy')
vv <- do.call(paste,v))
DT2[,.SDcols=Task_Alpha:Task_Delta]
}
bench::mark(mtd0(),mtd1(),mtd2(),check=FALSE)
时间:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 2.19m 2.19m 0.00760 252MB 2.26 1 297 2.19m <int[,9] [1,000,000 x 9]> <df[,3] [171,481 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 33.16s 33.16s 0.0302 856MB 0.754 1 25 33.16s <df[,10] [843,688 x 10]> <df[,3] [8,454 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 844.95ms 844.95ms 1.18 298MB 1.18 1 1 844.95ms <df[,14] [1,000 x 14]> <df[,912 x 3]> <bch:tm> <tibble [1 x 3]>
,
library(data.table)
#set df to data.table
setDT(df)
#convert factor-columns to character
f_cols <- names(df)[ sapply( df,is.factor ) ]
df[,.SDcols = f_cols ]
#melt to long format
DT.melt <- melt( df,measure.vars = patterns( task = "^Task_"),variable.name = grep("^Task",names(df),value = TRUE) )
#set order of Aplha-Beta-etc...
DT.melt[ grepl( "Alpha",order := 1 ]
DT.melt[ grepl( "Beta",order := 2 ]
DT.melt[ grepl( "Charlie",order := 3 ]
DT.melt[ grepl( "Delta",order := 4 ]
#order DT.melt
setorder( DT.melt,order )
#fill in codes EE,etc...
DT.melt[,by = userID ]
#filter only rows without NA
DT.melt <- DT.melt[ complete.cases( DT.melt ) ]
str(DT.melt)
#cast to wide output
dcast( DT.melt,fun.aggregate = length )
# userID EE EH EM HE HH MM
# 1: 3108 3 0 0 0 0 0
# 2: 3207 0 0 1 1 0 1
# 3: 3350 0 1 0 1 1 0
# 4: 3961 0 0 0 0 1 0
# 5: 4021 1 0 0 0 0 0
,
涉及dplyr
和tidyr
的一个选项可能是:
df %>%
select(-Score) %>%
pivot_longer(names_to = "variables",values_to = "values",-userID) %>%
select(-variables) %>%
group_by(userID) %>%
filter(!is.na(values) & !is.na(lag(values,default = first(values)))) %>%
mutate(variables = paste(values,lead(values),sep = "-")) %>%
filter(row_number() != n()) %>%
count(variables) %>%
ungroup() %>%
pivot_wider(names_from = "variables",values_from = "n",values_fill = list(n = 0))
userID `Easy-Easy` `Easy-Match` `Hard-Easy` `Match-Match` `Easy-Hard` `Hard-Hard`
<int> <int> <int> <int> <int> <int> <int>
1 3108 3 0 0 0 0 0
2 3207 0 1 1 1 0 0
3 3350 0 0 1 0 1 1
4 3961 0 0 0 0 1 0
5 4021 1 0 0 0 0 0
首先,它将数据从宽格式转换为长格式。其次,它按userID
分组,排除缺失值和滞后缺失值。第三,它将当前值和潜在值连接起来。第四,它计算每个userID
中给定组合的出现。最后,它将数据转换为宽格式。
如果您还想要不存在的组合:
x <- c("Easy","Match")
df %>%
select(-Score) %>%
pivot_longer(names_to = "variables",sep = "-")) %>%
filter(row_number() != n()) %>%
count(variables) %>%
complete(variables = c(outer(x,x,FUN = paste,sep = "-")),fill = list(n = 0)) %>%
ungroup() %>%
pivot_wider(names_from = "variables",values_from = "n")