如何按有序顺序计算因子

我有一个数据框df：

userID Score  Task_Alpha Task_Beta Task_Charlie Task_Delta 
3108  -8.00   Easy       Easy      Easy         Easy    
3207   3.00   Hard       Easy      Match        Match
3350   5.78   Hard       Easy      Hard         Hard
3961   10.00  Easy       NA        Hard         Hard
4021   10.00  Easy       Easy      NA           Hard


1. userID is factor variable
2. Score is numeric
3. All the 'Task_' features are factor variables with possible values 'Hard','Easy','Match' or NA

我想计算Task_功能之间的可能过渡。供参考，可能的转换是：

EE transition from Easy -> Easy
EM transition from Easy -> Match
EH transition from Easy -> Hard
ME transition from Match-> Easy
MM transition from Match-> Match
MH transition from Match-> Hard
HE transition from Hard -> Easy
HM transition from Hard -> Match
HH transition from Hard -> Hard

由于存在三个可能的值（不包括NA情况），因此输出列如下：

userID  EE  EM  EH  MM  ME  MH  HH  HE  HM
3108    3   0   0   0   0   0   0   0   0
3207    0   1   0   1   0   0   0   1   0
3350    0   0   1   0   0   0   1   1   0
3961    0   0   0   0   0   0   1   0   0
4021    1   0   0   0   0   0   0   0   0

1）在此示例中，每个用户ID最多可以具有3个状态转换。

2）请注意，对于用户3961和4021，NA减少了可能的状态转换。

对于这些问题的任何建议，将不胜感激。

数据dput()为：

df <- structure(list(
userID = c(3108L,3207L,3350L,3961L,4021L),Score = c(-8,3,5.78,10,10),Task_Alpha = structure(c(1L,2L,1L,1L),.Label = c("Easy","Hard"),class = "factor"),Task_Beta = structure(c(1L,NA,.Label = "Easy",Task_Charlie = structure(c(1L,3L,NA),"Hard","Match"),Task_Delta = structure(c(1L,2L),class = "factor")),class = "data.frame",row.names = c(NA,-5L))

通过基数R的另一种想法是将值粘贴到其先前的值（逐行），转换为因数以获得所有9个级别（使用expand.grid仅包含您想要的级别-还可以处理NA），最后通过table对值进行计数。最后一步是将ID与结果绑定，即

cbind.data.frame(df$userID,t(apply(df[-c(1:2)],1,function(i) { 
                          i1 <- paste(i[-length(i)],i[-1]); 
                          i1 <- factor(i1,levels = do.call(paste,expand.grid(c('Easy','Match','Hard'),c('Easy','Hard')))); 
                         table(i1) })))

给出，

  df$userID Easy Easy Match Easy Hard Easy Easy Match Match Match Hard Match Easy Hard Match Hard Hard Hard
1      3108         3          0         0          0           0          0         0          0         0
2      3207         0          0         1          1           1          0         0          0         0
3      3350         0          0         1          0           0          0         1          0         1
4      3961         0          0         0          0           0          0         0          0         1
5      4021         1          0         0          0           0          0         0          0         0

另一个类似于Sotos的方法，但是1）使用data.table，2）不使用factor和3）将table替换为Rfast::rowTabulate：

v <- c('Hard','Easy')
vv <- do.call(paste,expand.grid(v,v))
DT[,(vv) := {
        mat <- mapply(paste,.SD[,-ncol(.SD),with=FALSE],-1L])
        as.data.table(Rfast::rowTabulate(matrix(match(mat,vv,0L),nrow=.N)))
    },.SDcols=Task_Alpha:Task_Delta]

输出：

   userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Hard Match Hard Easy Hard Hard Match Match Match Easy Match Hard Easy Match Easy Easy Easy
1:   3108 -8.00       Easy      Easy         Easy       Easy         0          0         0          0           0          0         0          0         3
2:   3207  3.00       Hard      Easy        Match      Match         0          0         0          0           1          1         1          0         0
3:   3350  5.78       Hard      Easy         Hard       Hard         1          0         1          0           0          0         1          0         0
4:   3961 10.00       Easy      <NA>         Hard       Hard         1          0         0          0           0          0         0          0         0
5:   4021 10.00       Easy      Easy         <NA>       Hard         0          0         0          0           0          0         0          0         1

数据：

library(data.table)
library(Rfast)
DT <- structure(list(
    userID = c(3108L,3207L,3350L,3961L,4021L),Score = c(-8,3,5.78,10,10),Task_Alpha = structure(c(1L,2L,1L,1L),.Label = c("Easy","Hard"),class = "factor"),Task_Beta = structure(c(1L,NA,.Label = "Easy",Task_Charlie = structure(c(1L,3L,NA),"Hard","Match"),Task_Delta = structure(c(1L,2L),class = "factor")),class = "data.frame",row.names = c(NA,-5L))
setDT(DT)

了解这种方法在实际数据集上的运行速度以及实际数据集是否很大将很有趣。

编辑：添加了一些时间

library(data.table)
nr <- 1e6
vec <- c('Hard','Easy',NA)
DT <- data.table(userID=1:nr,Task_Alpha=sample(vec,nr,TRUE),Task_Beta=sample(vec,Task_Charlie=sample(vec,Task_Delta=sample(vec,TRUE))
df <- as.data.frame(DT)
DT0 <- copy(DT)
DT1 <- copy(DT)
DT2 <- copy(DT)

mtd0 <- function() {
    t(apply(df[-1L],function(i) {
        i1 <- paste(i[-length(i)],i[-1L]);
        i1 <- factor(i1,'Hard'))));
        table(i1)
    }))
}

mtd1 <- function() {
    f_cols <- names(DT0)[ sapply( DT0,is.factor ) ]
    DT0[,(f_cols) := lapply(.SD,as.character),.SDcols = f_cols ]
    #melt to long format
    DT.melt <- melt( DT0,id.vars = "userID",measure.vars = patterns( task = "^Task_"))
    #set order of Aplha-Beta-etc...
    DT.melt[ grepl( "Alpha",variable ),order := 1 ]
    DT.melt[ grepl( "Beta",order := 2 ]
    DT.melt[ grepl( "Charlie",order := 3 ]
    DT.melt[ grepl( "Delta",order := 4 ]
    #order DT.melt
    setorder( DT.melt,userID,order )
    #fill in codes EE,etc...
    DT.melt[,`:=`( code1 = gsub( "(^.).*","\\1",value ),code2 = gsub( "(^.).*",shift( value,type = "lead" ) ) ),by = userID ]
    #filter only rows without NA
    DT.melt <- DT.melt[ complete.cases( DT.melt ) ]
    #cast to wide output
    dcast( DT.melt,userID ~ paste0( code2,code1 ),fun.aggregate = length )
}

mtd2 <- function() {
    v <- c('Hard','Easy')
    vv <- do.call(paste,v))
    DT2[,.SDcols=Task_Alpha:Task_Delta]
}

bench::mark(mtd0(),mtd1(),mtd2(),check=FALSE)

时间：

# A tibble: 3 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                     memory                 time     gc              
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>                     <list>                 <list>   <list>          
1 mtd0()        2.19m    2.19m   0.00760     252MB    2.26      1   297      2.19m <int[,9] [1,000,000 x 9]>  <df[,3] [171,481 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1()       33.16s   33.16s   0.0302      856MB    0.754     1    25     33.16s <df[,10] [843,688 x 10]>   <df[,3] [8,454 x 3]>   <bch:tm> <tibble [1 x 3]>
3 mtd2()     844.95ms 844.95ms   1.18        298MB    1.18      1     1   844.95ms <df[,14] [1,000 x 14]> <df[,912 x 3]>   <bch:tm> <tibble [1 x 3]>

library(data.table)
#set df to data.table
setDT(df)
#convert factor-columns to character
f_cols <- names(df)[ sapply( df,is.factor ) ]
df[,.SDcols = f_cols ]
#melt to long format
DT.melt <- melt( df,measure.vars = patterns( task = "^Task_"),variable.name = grep("^Task",names(df),value = TRUE) )
#set order of Aplha-Beta-etc...
DT.melt[ grepl( "Alpha",order := 1 ]
DT.melt[ grepl( "Beta",order := 2 ]
DT.melt[ grepl( "Charlie",order := 3 ]
DT.melt[ grepl( "Delta",order := 4 ]
#order DT.melt
setorder( DT.melt,order )
#fill in codes EE,etc...
DT.melt[,by = userID ]
#filter only rows without NA
DT.melt <- DT.melt[ complete.cases( DT.melt ) ]
str(DT.melt)
#cast to wide output
dcast( DT.melt,fun.aggregate = length )

#    userID EE EH EM HE HH MM
# 1:   3108  3  0  0  0  0  0
# 2:   3207  0  0  1  1  0  1
# 3:   3350  0  1  0  1  1  0
# 4:   3961  0  0  0  0  1  0
# 5:   4021  1  0  0  0  0  0

涉及dplyr和tidyr的一个选项可能是：

df %>%
 select(-Score) %>%
 pivot_longer(names_to = "variables",values_to = "values",-userID) %>%
 select(-variables) %>%
 group_by(userID) %>%
 filter(!is.na(values) & !is.na(lag(values,default = first(values)))) %>%
 mutate(variables = paste(values,lead(values),sep = "-")) %>%
 filter(row_number() != n()) %>%
 count(variables) %>%
 ungroup() %>%
 pivot_wider(names_from = "variables",values_from = "n",values_fill = list(n = 0))

  userID `Easy-Easy` `Easy-Match` `Hard-Easy` `Match-Match` `Easy-Hard` `Hard-Hard`
   <int>       <int>        <int>       <int>         <int>       <int>       <int>
1   3108           3            0           0             0           0           0
2   3207           0            1           1             1           0           0
3   3350           0            0           1             0           1           1
4   3961           0            0           0             0           1           0
5   4021           1            0           0             0           0           0

首先，它将数据从宽格式转换为长格式。其次，它按userID分组，排除缺失值和滞后缺失值。第三，它将当前值和潜在值连接起来。第四，它计算每个userID中给定组合的出现。最后，它将数据转换为宽格式。

如果您还想要不存在的组合：

x <- c("Easy","Match")

df %>%
 select(-Score) %>%
 pivot_longer(names_to = "variables",sep = "-")) %>%
 filter(row_number() != n()) %>%
 count(variables) %>%
 complete(variables = c(outer(x,x,FUN = paste,sep = "-")),fill = list(n = 0)) %>%
 ungroup() %>%
 pivot_wider(names_from = "variables",values_from = "n")

如何按有序顺序计算因子

laolvye 回答：如何按有序顺序计算因子

大家都在问