您可以在 *apply 或 map* 函数中,将数据框 df 与每个取值逐一进行比较,计算所得布尔矩阵的行和,然后将结果与原始数据框合并:
,
library(data.table)

# Sample data: one row per user, one column per task rating.
DT <- fread("userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
")

# Reshape to long format: one row per (userID, task column) pair.
# The pattern is left unnamed and value.name is spelled out so that the melted
# column is always called "value", which the dcast formula below relies on
# (a named single pattern may rename it in some data.table versions).
DT.melt <- melt(
  DT,
  id.vars = "userID",
  measure.vars = patterns("^Task_"),
  value.name = "value"
)

# Count how often each rating occurs per user; missing ratings are tallied
# in their own NA column.
dcast(DT.melt, userID ~ value, fun.aggregate = length)
# userID NA Easy Hard Match
# 1: 3108 0 4 0 0
# 2: 3207 0 1 1 2
# 3: 3350 0 1 3 0
# 4: 3961 1 1 2 0
# 5: 4021 1 2 1 0
,
对于第一部分,可以按行使用 apply,并用 table 统计每一行中各因子水平出现的次数:
# Tally, for every row, how many task columns hold each rating, then attach
# the tallies to the userID column. The first two columns (userID, Score)
# are excluded from the count.
task_levels <- c("Easy", "Hard", "Match")
level_counts <- apply(df[-c(1, 2)], 1, function(row_values) {
  # Fixing the levels keeps a zero entry for ratings absent from this row.
  table(factor(row_values, levels = task_levels))
})
# apply() returns one column per row of df, so transpose before binding.
cbind(df[1], t(level_counts))
# userID Easy Hard Match
#1 3108 4 0 0
#2 3207 1 1 2
#3 3350 1 3 0
#4 3961 1 2 0
#5 4021 2 1 0
在 tidyverse 中,我们可以先将数据转换为长格式并删除 NA 值,再用 count 统计 userID 与 value 的每种组合出现的次数,最后将数据转换回宽格式。
library(dplyr)
library(tidyr)
# Reshape to long form (dropping missing ratings), tally each rating per user,
# then spread the tallies back out into one column per rating.
df %>%
  pivot_longer(
    cols = starts_with("Task"),
    names_to = "name",
    values_to = "value",
    values_drop_na = TRUE
  ) %>%
  count(userID, value) %>%
  pivot_wider(
    names_from = value,
    values_from = n,
    values_fill = list(n = 0)
  )
数据
# Example data, rebuilt to match the table printed elsewhere on this page:
# one row per user, with every task rating stored as a factor.
df <- data.frame(
  userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
  Score = c(-8, 3, 5.78, 10, 10),
  Task_Alpha = factor(c("Easy", "Hard", "Hard", "Easy", "Easy")),
  Task_Beta = factor(c("Easy", "Easy", "Easy", NA, "Easy")),
  Task_Charlie = factor(c("Easy", "Match", "Hard", "Hard", NA)),
  Task_Delta = factor(c("Easy", "Match", "Hard", "Hard", "Hard"))
)
,
另一个选项是使用 Rfast::rowTabulate:
# Ratings to count; the position in this vector is the integer code used below.
v <- c("Hard", "Match", "Easy", NA)

# Add one count column per entry of v, by reference.
DT[, (v) := {
  # Recode every task cell as its position in v, keeping one row per user.
  level_idx <- matrix(match(as.matrix(.SD), v), nrow = .N)
  as.data.table(Rfast::rowTabulate(level_idx))
}, .SDcols = Task_Alpha:Task_Delta]
输出:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy NA
1: 3108 -8.00 Easy Easy Easy Easy 0 0 4 0
2: 3207 3.00 Hard Easy Match Match 1 2 1 0
3: 3350 5.78 Hard Easy Hard Hard 3 0 1 0
4: 3961 10.00 Easy <NA> Hard Hard 2 0 1 1
5: 4021 10.00 Easy Easy <NA> Hard 1 0 2 1
来自Wimpel的数据:
library(data.table)

# Sample data, given line by line: a header followed by one row per user.
sample_lines <- c(
  "userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta",
  "3108 -8.00 Easy Easy Easy Easy",
  "3207 3.00 Hard Easy Match Match",
  "3350 5.78 Hard Easy Hard Hard",
  "3961 10.00 Easy NA Hard Hard",
  "4021 10.00 Easy Easy NA Hard"
)
DT <- fread(text = sample_lines)
如果能知道这种方法在实际数据集上的运行速度,以及实际数据集有多大,会很有意思。
编辑:添加了计时
library(data.table)

# Simulate a large data set for timing the approaches.
set.seed(0L)
nr <- 1e6
# Possible ratings, including a missing one.
v <- c("Hard", "Match", "Easy", NA)
DT <- data.table(
  userID = seq_len(nr),
  Task_Alpha = sample(v, nr, TRUE),
  Task_Beta = sample(v, nr, TRUE),
  Task_Charlie = sample(v, nr, TRUE),
  Task_Delta = sample(v, nr, TRUE)
)
# Plain data.frame copy for the base R approach.
df <- as.data.frame(DT)
# Base R approach: for each row of the global `df` (first column excluded),
# count how many task columns hold each rating.
# Returns an integer matrix with one row per row of `df` and one column per
# rating ("Easy", "Hard", "Match").
mtd0 <- function() {
  task_levels <- c("Easy", "Hard", "Match")
  # MARGIN = 1 applies the tabulation row by row; fixing the levels keeps a
  # zero entry for ratings that do not occur in a row.
  t(apply(df[-1L], 1L, function(x) table(factor(x, levels = task_levels))))
}
# data.table approach: melt the task columns of the global `DT` to long
# format, then count the occurrences of each rating per user with dcast.
mtd1 <- function() {
  # The pattern is left unnamed and value.name is explicit so the melted
  # column is always called "value", as the dcast formula expects.
  DT.melt <- melt(
    DT,
    id.vars = "userID",
    measure.vars = patterns("^Task_"),
    value.name = "value"
  )
  dcast(DT.melt, userID ~ value, fun.aggregate = length)
}
# Rfast approach: recode every task cell of the global `DT` as its position
# in the global rating vector `v`, then tabulate those codes row by row.
mtd2 <- function() {
  DT[, Rfast::rowTabulate(matrix(match(as.matrix(.SD), v), nrow = .N)),
     .SDcols = Task_Alpha:Task_Delta]
}
# Time the three approaches. Their results are not compared with each other
# (check = FALSE).
bench::mark(
  mtd0(),
  mtd1(),
  mtd2(),
  check = FALSE
)
时间:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 54.7s 54.7s 0.0183 137MB 1.70 1 93 54.7s <int[,3] [1,000,000 x 3]> <df[,3] [107,168 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 2.4s 2.4s 0.417 398MB 0.833 1 2 2.4s <df[,5] [1,000 x 5]> <df[,3] [12,517 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 252.8ms 264.4ms 3.78 107MB 3.78 2 2 528.7ms <int[,4] [1,000 x 4]> <df[,3] [6,509 x 3]> <bch:tm> <tibble [2 x 3]>
,
如果您使用的是 base R,则以下内容可能会对您有所帮助:
# Append one count column per rating: the number of Task_ columns in each row
# that equal that rating (missing values are ignored).
task_levels <- c("Hard", "Match", "Easy")
is_task_col <- startsWith(names(df), "Task_")
# lapply keeps one vector per rating, so the result has the right shape even
# when df has a single row.
level_counts <- lapply(task_levels, function(v) {
  unname(rowSums(df[is_task_col] == v, na.rm = TRUE))
})
names(level_counts) <- task_levels
df <- cbind(df, as.data.frame(level_counts))
输出:
> df
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy
1 3108 -8.00 Easy Easy Easy Easy 0 0 4
2 3207 3.00 Hard Easy Match Match 1 2 1
3 3350 5.78 Hard Easy Hard Hard 3 0 1
4 3961 10.00 Easy <NA> Hard Hard 2 0 1
5 4021 10.00 Easy Easy <NA> Hard 1 0 2