R:了解“future_apply”的局限性

我正在使用 R 编程语言。我正在尝试更多地了解“并行计算和并行处理”,这将使我能够更有效地使用计算机对大量数据执行更快的操作。

例如,这是一个 R 例程,它生成一个大的点网格,然后对这些点的每一行执行一系列计算(由“函数”定义)并保存结果:

library(dplyr)
library(data.table)

results_table <- data.frame()

grid_function <- function(train_data,random_1,random_2,random_3,random_4,split_1,split_2,split_3) {
    
    
    
    #bin data according to random criteria
    train_data <- train_data %>% mutate(cat = ifelse(a1 <= random_1 & b1 <= random_3,"a",ifelse(a1 <= random_2 & b1 <= random_4,"b","c")))
    
    train_data$cat = as.factor(train_data$cat)
    
    #new splits
    a_table = train_data %>%
        filter(cat == "a") %>%
        select(a1,b1,c1,cat)
    
    b_table = train_data %>%
        filter(cat == "b") %>%
        select(a1,cat)
    
    c_table = train_data %>%
        filter(cat == "c") %>%
        select(a1,cat)
    
    
    #calculate random quantile ("quant") for each bin
    
    table_a = data.frame(a_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1,prob = split_1)))
    
    table_b = data.frame(b_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1,prob = split_2)))
    
    table_c = data.frame(c_table%>% group_by(cat) %>%
                             mutate(quant = quantile(c1,prob = split_3)))
    
    
    
    
    #create a new variable ("diff") that measures if the quantile is bigger tha the value of "c1"
    table_a$diff = ifelse(table_a$quant > table_a$c1,1,0)
    table_b$diff = ifelse(table_b$quant > table_b$c1,0)
    table_c$diff = ifelse(table_c$quant > table_c$c1,0)
    
    #group all tables
    
    final_table = rbind(table_a,table_b,table_c)
    
    #create a table: for each bin,calculate the average of "diff"
    final_table_2 = data.frame(final_table %>%
                                   group_by(cat) %>%
                                   summarize(
                                       mean = mean(diff)
                                   ))
    
    #add "total mean" to this table
    final_table_2 = data.frame(final_table_2 %>% add_row(cat = "total",mean = mean(final_table$diff)))
    
    #format this table: add the random criteria to this table for reference
    final_table_2$random_1 = random_1
    
    final_table_2$random_2 = random_2
    
    final_table_2$random_3 = random_3
    
    final_table_2$random_4 = random_4
    
    final_table_2$split_1 = split_1
    
    final_table_2$split_2 = split_2
    
    final_table_2$split_3 = split_3
    
    
    
    
    results_table <- rbind(results_table,final_table_2)
    
    final_results = dcast(setDT(results_table),random_1 + random_2 + random_3 + random_4 + split_1 + split_2 + split_3 ~ cat,value.var = 'mean')
    
}

# create some data for this example
a1 = rnorm(1000,100,10)
b1 = rnorm(1000,5)
c1 = sample.int(1000,1000,replace = TRUE)
train_data = data.frame(a1,c1)




#grid
random_1 <- seq(80,5)
random_2 <- seq(85,120,5)
random_3 <- seq(85,5)
random_4 <- seq(90,5)
split_1 =  seq(0,0.1)
split_2 =  seq(0,0.1)
split_3 =  seq(0,0.1)
DF_1 <- expand.grid(random_1,split_3)

#reduce the size of the grid for this example
DF_1 = DF_1[1:100,]

colnames(DF_1) <- c("random_1","random_2","random_3","random_4","split_1","split_2","split_3")

train_data_new <- copy(train_data)


resultdf1 <- apply(DF_1,# 1 means rows
                   FUN=function(x){
                       do.call(
                           # Call Function grid_function2 with the arguments in
                           # a list
                           grid_function,# force list type for the arguments
                           c(list(train_data_new),as.list(
                               # make the row to a named vector
                               unlist(x)
                           )
                           ))
                   }
)

l = resultdf1
final_output = rbindlist(l,fill = TRUE)

代码运行完成后,最终输出如下所示:

#this is the final output from my code - should be a data frame:

head(final_output)
   random_1 random_2 random_3 random_4 split_1 split_2 split_3   b         c total  a
1:       80       85       85       90     0.5     0.5     0.5 0.5 0.5000000 0.500 NA
2:       85       85       85       90     0.5     0.5     0.5 0.5 0.5000000 0.500 NA
3:       90       85       85       90     0.5     0.5     0.5 0.5 0.5000000 0.500 NA
4:       95       85       85       90     0.5     0.5     0.5 0.5 0.4994985 0.499  0
5:      100       85       85       90     0.5     0.5     0.5 0.5 0.4994985 0.499  0
6:       80       90       85       90     0.5     0.5     0.5 0.5 0.4989960 0.499 NA

问题:现在,我想看看是否可以为“更大”的网格运行上述代码 - 例如“DF_1”有数百万行。

到目前为止我尝试过的:我发现了这个名为“futre.apply”(https://cran.r-project.org/web/packages/future.apply/vignettes/future.apply-1-overview.html)的库,它可以在对更大的数据集执行计算时更好地利用计算机的功能.

我从上面的问题重新定义了“完整网格”:

random_1 <- seq(80,5)
    random_2 <- seq(85,5)
    random_3 <- seq(85,5)
    random_4 <- seq(90,5)
    split_1 =  seq(0,0.1)
    split_2 =  seq(0,0.1)
    split_3 =  seq(0,0.1)
    DF_1 <- expand.grid(random_1,split_3)

然后,我尝试设置“future.apply”库以更有效的方式执行代码:

library(future.apply)
 args = list(X = DF_1,MARGIN = 1,FUN = function(x){ # I did not run it entirely because I guesss it can be very long so only the 200 st rows
      do.call(
        # Call Function grid_function with the arguments in
        # a list
        grid_function,# force list type for the arguments
        c(list(train_data_new),as.list(
          # make the row to a named vector
          unlist(x)
        )
        ))
    })

#launch code to run in parallel (note: if you set DF_1 = DF_1[1:100,],then "a" runs very quickly)
 a = do.call(future_apply,args)

这段代码已经运行了大约 1 小时 - 看到网格非常大:我不确定这是否会最终完成运行,还是会无限期运行(即“future_apply”的限制)

问题:有谁知道这段代码是否有可能完成运行,或者是否有其他方法可以“并行化”这段代码并使其更有效?

谢谢

wangsdfsdfsfdgdf 回答:R:了解“future_apply”的局限性

暂时没有好的解决方案,如果你有好的解决方案,请发邮件至:iooj@foxmail.com
本文链接:https://www.f2er.com/13761.html

大家都在问