借用来自@ r2evans的样本数据,我们可以在此处使用reshape2::dcast()
。
这是一种不寻常的用法,因为我们仅将其用于聚合和页边距属性,而不是将其重塑/旋转到更宽的范围。
请注意,data.table::dcast()
支持多个value.var
,但是不支持边距,因此这里无济于事。
library(reshape2)
dcast(dat,Flag1 + Flag2 + Flag3 ~ .,fun.aggregate = sum,value.var = "Type1",margins = TRUE)
#> Flag1 Flag2 Flag3 .
#> 1 Level1 A FIFTH 1
#> 2 Level1 A FIRST 2
#> 3 Level1 A FOURTH 9
#> 4 Level1 A SECOND 1
#> 5 Level1 A SIXTH 1
#> 6 Level1 A THIRD 3
#> 7 Level1 A (all) 17
#> 8 Level1 B FIFTH 3
#> 9 Level1 B FIRST 0
#> 10 Level1 B FOURTH 4
#> 11 Level1 B SECOND 3
#> 12 Level1 B SIXTH 0
#> 13 Level1 B THIRD 5
#> 14 Level1 B (all) 15
#> 15 Level1 (all) (all) 32
#> 16 Level2 A FIFTH 1
#> 17 Level2 A FIRST 2
#> 18 Level2 A FOURTH 9
#> 19 Level2 A SECOND 1
#> 20 Level2 A SIXTH 1
#> 21 Level2 A THIRD 3
#> 22 Level2 A (all) 17
#> 23 Level2 B FIFTH 3
#> 24 Level2 B FIRST 0
#> 25 Level2 B FOURTH 4
#> 26 Level2 B SECOND 3
#> 27 Level2 B SIXTH 0
#> 28 Level2 B THIRD 5
#> 29 Level2 B (all) 15
#> 30 Level2 (all) (all) 32
#> 31 Level3 A FIFTH 1
#> 32 Level3 A FIRST 2
#> 33 Level3 A FOURTH 9
#> 34 Level3 A SECOND 1
#> 35 Level3 A SIXTH 1
#> 36 Level3 A THIRD 3
#> 37 Level3 A (all) 17
#> 38 Level3 B FIFTH 3
#> 39 Level3 B FIRST 0
#> 40 Level3 B FOURTH 4
#> 41 Level3 B SECOND 3
#> 42 Level3 B SIXTH 0
#> 43 Level3 B THIRD 5
#> 44 Level3 B (all) 15
#> 45 Level3 (all) (all) 32
#> 46 (all) (all) (all) 96
然后可以重命名列,或者为避免重命名,只需先创建一个常量列即可:
dat$whatev <- "Sum"
dcast(dat,Flag1 + Flag2 + Flag3 ~ whatev,margins = TRUE)
在同一张表中获取所有金额
由于提供了其他答案,因此,如果您希望所有金额都在同一张表中,可以执行以下操作:
library(reshape2)
sum_with_margins <- function(col) {
dat$whatev <- col
dcast(dat,value.var = col,margins = paste0("Flag",1:3))
}
Reduce(merge,lapply(paste0("Type",1:3),sum_with_margins))
#> Flag1 Flag2 Flag3 Type1 Type2 Type3
#> 1 (all) (all) (all) 96 948 0
#> 2 Level1 (all) (all) 32 316 0
#> 3 Level1 A (all) 17 69 0
#> 4 Level1 A FIFTH 1 22 0
#> 5 Level1 A FIRST 2 0 0
#> 6 Level1 A FOURTH 9 18 0
#> 7 Level1 A SECOND 1 9 0
#> 8 Level1 A SIXTH 1 13 0
#> 9 Level1 A THIRD 3 7 0
#> 10 Level1 B (all) 15 247 0
#> 11 Level1 B FIFTH 3 40 0
#> 12 Level1 B FIRST 0 0 0
#> 13 Level1 B FOURTH 4 96 0
#> 14 Level1 B SECOND 3 9 0
#> 15 Level1 B SIXTH 0 17 0
#> 16 Level1 B THIRD 5 85 0
#> 17 Level2 (all) (all) 32 316 0
#> 18 Level2 A (all) 17 69 0
#> 19 Level2 A FIFTH 1 22 0
#> 20 Level2 A FIRST 2 0 0
#> 21 Level2 A FOURTH 9 18 0
#> 22 Level2 A SECOND 1 9 0
#> 23 Level2 A SIXTH 1 13 0
#> 24 Level2 A THIRD 3 7 0
#> 25 Level2 B (all) 15 247 0
#> 26 Level2 B FIFTH 3 40 0
#> 27 Level2 B FIRST 0 0 0
#> 28 Level2 B FOURTH 4 96 0
#> 29 Level2 B SECOND 3 9 0
#> 30 Level2 B SIXTH 0 17 0
#> 31 Level2 B THIRD 5 85 0
#> 32 Level3 (all) (all) 32 316 0
#> 33 Level3 A (all) 17 69 0
#> 34 Level3 A FIFTH 1 22 0
#> 35 Level3 A FIRST 2 0 0
#> 36 Level3 A FOURTH 9 18 0
#> 37 Level3 A SECOND 1 9 0
#> 38 Level3 A SIXTH 1 13 0
#> 39 Level3 A THIRD 3 7 0
#> 40 Level3 B (all) 15 247 0
#> 41 Level3 B FIFTH 3 40 0
#> 42 Level3 B FIRST 0 0 0
#> 43 Level3 B FOURTH 4 96 0
#> 44 Level3 B SECOND 3 9 0
#> 45 Level3 B SIXTH 0 17 0
#> 46 Level3 B THIRD 5 85 0
,
这是我想给你所有边缘的镜头。
预先
eg <- do.call(expand.grid,c(lapply(dat[1:3],function(a) c(NA,unique(a))),stringsAsFactors = FALSE))
head(eg)
out <- do.call(
Map,c(unname(eg),list(f = function(f1,f2,f3) {
subx <- subset(dat,(is.na(f1) | f1 == Flag1) &
(is.na(f2) | f2 == Flag2) &
(is.na(f3) | f3 == Flag3))
subx <- subx[,setdiff(colnames(subx),c("Flag1","Flag2","Flag3"))]
c(sapply(subx,sum),"(all)" = sum(unlist(subx)))
})))
out <- cbind.data.frame(
sapply(eg,function(a) ifelse(is.na(a),"(all)",as.character(a))),do.call(rbind,out),stringsAsFactors = FALSE)
rownames(out) <- NULL # cosmetic
out <- out[order(out$Flag1 == "(all)",out$Flag1,out$Flag2 == "(all)",out$Flag2,out$Flag3 == "(all)",out$Flag3),]
out[c(1,2,7,8,13,14,20,21,64,84),] # cherry-pick for this view
# Flag1 Flag2 Flag3 Type1 Type2 Type3 (all)
# 66 Level1 A FIFTH 1 22 0 23
# 18 Level1 A FIRST 2 0 0 2
# 6 Level1 A (all) 17 69 0 86
# 70 Level1 B FIFTH 3 40 0 43
# 46 Level1 B THIRD 5 85 0 90
# 10 Level1 B (all) 15 247 0 262
# 38 Level1 (all) THIRD 8 92 0 100
# 2 Level1 (all) (all) 32 316 0 348
# 65 (all) A FIFTH 3 66 0 69
# 1 (all) (all) (all) 96 948 0 1044
演练
-
第一部分生成一个框架,其中包含现有Flag*
变量的所有可能组合以及特殊的NA
(稍后说明):
eg <- do.call(expand.grid,stringsAsFactors = FALSE))
head(eg)
# Flag1 Flag2 Flag3
# 1 <NA> <NA> <NA>
# 2 Level1 <NA> <NA>
# 3 Level2 <NA> <NA>
# 4 Level3 <NA> <NA>
# 5 <NA> A <NA>
# 6 Level1 A <NA>
nrow(eg)
# [1] 84
-
(这似乎很复杂。)Map
遍历eg
的每一行,但通常将单个列表/向量分量作为参数。因为eg
已经是list
(data.frame
),所以我需要使用do.call
来传递eg
中的列作为单独的(未命名的)参数。 (从技术上讲,我本可以做Map(function(...)...,eg[,1],2],3])
,但是如果您使用的是非恒定列或更多列,那可以说是不太笼统了。)
该函数的内部,它根据简单的标准对整个帧进行子集化:如果Flag*
中的eg
变量为NA
,则接受该标志的所有值,否则完全匹配。 (请注意,这是在做一些范围突破,因为它已经超出了匿名函数的范围,可以找到dat
。)
out <- do.call(
Map,"(all)" = sum(unlist(subx)))
})))
head(out,n=3)
# $<NA>
# Type1 Type2 Type3 (all)
# 96 948 0 1044
# $Level1
# Type1 Type2 Type3 (all)
# 32 316 0 348
# $Level2
# Type1 Type2 Type3 (all)
# 32 316 0 348
-
将NA
重命名为您的审美观(all)
,确保character
(不是factor
)。 (删除用cbind...
创建的行名,只是修饰。)
out <- cbind.data.frame(
sapply(eg,stringsAsFactors = FALSE)
rownames(out) <- NULL
-
按标志排序,最后(all)
。
out <- out[order(out$Flag1 == "(all)",]
我意识到Flag3
的顺序不是上下文顺序的。为此,我建议使用factor
。
免责声明:我尚未验证所有款项的正确性。
数据,没有factor
:
dat <- read.table(header=TRUE,stringsAsFactors=FALSE,text="
Flag1 Flag2 Flag3 Type1 Type2 Type3
1 Level1 A FIRST 2 0 0
2 Level1 A SECOND 1 9 0
3 Level1 A THIRD 3 7 0
4 Level1 A FOURTH 9 18 0
5 Level1 A FIFTH 1 22 0
6 Level1 A SIXTH 1 13 0
7 Level1 B FIRST 0 0 0
8 Level1 B SECOND 3 9 0
9 Level1 B THIRD 5 85 0
10 Level1 B FOURTH 4 96 0
11 Level1 B FIFTH 3 40 0
12 Level1 B SIXTH 0 17 0
22 Level2 A FIRST 2 0 0
23 Level2 A SECOND 1 9 0
24 Level2 A THIRD 3 7 0
25 Level2 A FOURTH 9 18 0
26 Level2 A FIFTH 1 22 0
27 Level2 A SIXTH 1 13 0
28 Level2 B FIRST 0 0 0
29 Level2 B SECOND 3 9 0
30 Level2 B THIRD 5 85 0
31 Level2 B FOURTH 4 96 0
32 Level2 B FIFTH 3 40 0
33 Level2 B SIXTH 0 17 0
34 Level3 A FIRST 2 0 0
35 Level3 A SECOND 1 9 0
36 Level3 A THIRD 3 7 0
37 Level3 A FOURTH 9 18 0
38 Level3 A FIFTH 1 22 0
39 Level3 A SIXTH 1 13 0
40 Level3 B FIRST 0 0 0
41 Level3 B SECOND 3 9 0
42 Level3 B THIRD 5 85 0
43 Level3 B FOURTH 4 96 0
44 Level3 B FIFTH 3 40 0
45 Level3 B SIXTH 0 17 0")
,
是的,这是一个很好的问题。...
我使您的数据如下:
data "c:\blah\test.sas7bdat";
input id Flag1 $ Flag2 $ Flag3 $ Type1 Type2 Type3;
datalines;
1 Level1 A FIRST 2 0 0
2 Level1 A SECOND 1 9 0
3 Level1 A THIRD 3 7 0
4 Level1 A FOURTH 9 18 0
5 Level1 A FIFTH 1 22 0
6 Level1 A SIXTH 1 13 0
7 Level1 B FIRST 0 0 0
8 Level1 B SECOND 3 9 0
9 Level1 B THIRD 5 85 0
10 Level1 B FOURTH 4 96 0
11 Level1 B FIFTH 3 40 0
12 Level1 B SIXTH 0 17 0
22 Level2 A FIRST 2 0 0
23 Level2 A SECOND 1 9 0
24 Level2 A THIRD 3 7 0
25 Level2 A FOURTH 9 18 0
26 Level2 A FIFTH 1 22 0
27 Level2 A SIXTH 1 13 0
28 Level2 B FIRST 0 0 0
29 Level2 B SECOND 3 9 0
30 Level2 B THIRD 5 85 0
31 Level2 B FOURTH 4 96 0
32 Level2 B FIFTH 3 40 0
33 Level2 B SIXTH 0 17 0
34 Level3 A FIRST 2 0 0
35 Level3 A SECOND 1 9 0
36 Level3 A THIRD 3 7 0
37 Level3 A FOURTH 9 18 0
38 Level3 A FIFTH 1 22 0
39 Level3 A SIXTH 1 13 0
40 Level3 B FIRST 0 0 0
41 Level3 B SECOND 3 9 0
42 Level3 B THIRD 5 85 0
43 Level3 B FOURTH 4 96 0
44 Level3 B FIFTH 3 40 0
45 Level3 B SIXTH 0 17 0
;
run;
在SAS中,我得到了这样的摘要:
PROC SUMMARY data="c:\blah\test.sas7bdat";
class Flag1 Flag2 Flag3;
var Type1;
output out =final_data (drop = _type_ _freq_) Sum=sum ;
run;
要将数据加载到R中,我使用了避风港软件包。然后,我使用了dplyr软件包中的函数进行处理
library(haven)
library(dplyr)
# the read_sas() function is in the haven package
test <- read_sas("c:\\blah\\test.sas7bdat")
# This uses dplry functions for nway
done <- test %>%
group_by(Flag1,Flag2,Flag3) %>%
summarise(sum(Type1)) %>%
ungroup() %>%
rename(Sum = `sum(Type1)`)
# This uses dplry functions to do all the subgroups
done1 <- test %>%
group_by(Flag1) %>%
summarise(sum(Type1)) %>%
rename(Sum = `sum(Type1)`)
done2 <- test %>%
group_by(Flag1,Flag2) %>%
summarise(sum(Type1)) %>%
rename(Sum = `sum(Type1)`)
done3 <- test %>%
group_by(Flag1,Flag3) %>%
summarise(sum(Type1)) %>%
rename(Sum = `sum(Type1)`)
all <- bind_rows(done3,done2,done1)
好消息是dplyr使用名称易于理解的动词功能。如果您想了解更多信息,请查看R for Data Science
从理论上讲,可以将构成“完成”对象的代码放入函数中,然后使用purrr包中的map函数运行。我还没有弄清楚如何列出所有变量。对此必须有一个功能,但我找不到。这篇文章可以帮助您:
Yihui on combinations
stringsOfVariables <- capture.output(
for (i in 1:n) {
if (i == 1) {
for (j in 1:n) {
cat(x[j],"\n")
}
}
else {
for (j in 1:(n - i + 1)) {
for (k in (j + i - 1):n) {
cat(c(x[j:(j + i - 2)],x[k]),"\n",sep = ",")
}
}
}
}
)
stringsOfVariables
这是一个对任何变量集进行汇总的函数。
library(rlang)
counts <- function(...) {
vars <- enquos(...)
test %>%
group_by(!!!vars) %>%
summarise(sum(Type1))
}
done3 <- counts(Flag1,Flag3)
也许其他人可以添加丢失的tidyverse位...