一种可行的方法是:先抽取 1671 个不同的基因,再把数据集筛选为属于这些基因、且 count 值在集合 r 内的行,最后在筛选结果中抽样。下面是用 data.table 实现这一思路的代码:
# Draw a random subset of SYN_data with ncnt rows covering exactly ng distinct
# genes, using only rows whose count is one of the values in r.
library(data.table)

# Dummy data, because the shape of the real data is not known.
set.seed(0L)
nr <- 15e4
nSNP <- 1e3
nGene <- 1e4
ncount <- 1:14
r <- c(1, 3, 7, 9, 14)
SYN_data <- data.table(
  SNP = sample(nSNP, nr, TRUE),
  Gene = sample(nGene, nr, TRUE),
  count = sample(ncount, nr, TRUE)
)
ncnt <- 2545
ng <- 1671

# Sample ng genes, but only among genes that have at least one row with an
# allowed count; a gene without such a row could never appear in the result.
eligible_genes <- SYN_data[count %in% r, unique(Gene)]
stopifnot(ncnt >= ng, length(eligible_genes) >= ng)
# Index with sample.int() rather than calling sample() on the values:
# sample(x, n) treats a length-one numeric x as the range 1:x.
g <- eligible_genes[sample.int(length(eligible_genes), ng)]

# Pick one random eligible row per sampled gene. .I holds row numbers of
# SYN_data; indexing it with sample.int(.N, 1L) stays inside the group even
# when the group has a single row.
ix <- SYN_data[Gene %in% g & count %in% r, .I[sample.int(.N, 1L)], by = Gene]$V1

# Fill up to ncnt rows from the remaining eligible rows of the same genes.
pool <- SYN_data[Gene %in% g & count %in% r, which = TRUE]
rest <- setdiff(pool, ix)
n_extra <- ncnt - length(ix)
stopifnot(length(rest) >= n_extra)
ans <- SYN_data[c(ix, rest[sample.int(length(rest), n_extra)])]

# Equals ng: every sampled gene contributes at least one row via ix.
ans[, uniqueN(Gene)]
输出:
SNP Gene count
1: 816 1261 14
2: 7 8635 1
3: 132 7457 1
4: 22 3625 3
5: 396 7640 7
---
2534: 423 6387 3
2535: 936 3908 7
2536: 346 9654 14
2537: 182 7492 3
2538: 645 635 1
也可以尝试使用 dplyr 的如下写法:
# Draw no_of_rows rows from SYN_data covering exactly no_of_unique_gene
# distinct genes, using only rows whose count is one of the values in r.
# Expects SYN_data (columns SNP, Gene, count) and r to exist already.
library(dplyr)

no_of_rows <- 2545
no_of_unique_gene <- 1671

# Draw the genes only from those with at least one row whose count is in r.
# This yields the requested number of distinct genes in a single pass,
# instead of re-sampling in a loop that may never terminate.
eligible_genes <- unique(SYN_data$Gene[SYN_data$count %in% r])
if (length(eligible_genes) < no_of_unique_gene) {
  stop("too few genes have a row with count in r", call. = FALSE)
}
# Index with sample.int(): sample(x, n) treats a length-one numeric x as 1:x.
gene <- eligible_genes[sample.int(length(eligible_genes), no_of_unique_gene)]

temp <- SYN_data[SYN_data$count %in% r & SYN_data$Gene %in% gene, ]
if (no_of_rows < no_of_unique_gene || nrow(temp) < no_of_rows) {
  stop("cannot draw no_of_rows rows for the sampled genes", call. = FALSE)
}

# Tag each row with an id so rows already drawn are excluded exactly, even
# when several rows share the same SNP, Gene and count values.
temp_ids <- temp %>%
  as_tibble() %>%
  mutate(row_id = row_number())

# Guarantee every sampled gene is represented ...
part1 <- temp_ids %>%
  group_by(Gene) %>%
  slice_sample(n = no_of_rows %/% no_of_unique_gene) %>%
  ungroup()

# ... then top up to no_of_rows from the rows not drawn yet.
part2 <- temp_ids %>%
  filter(!row_id %in% part1$row_id) %>%
  slice_sample(n = no_of_rows - nrow(part1))

final <- bind_rows(part1, part2) %>%
  select(-row_id)
然后用 length(unique(final$Gene)) 检查结果中不同基因的数量。
本文链接:https://www.f2er.com/3085565.html