我尝试在两个各有一个 GPU 的节点上，运行 https://www.tensorflow.org/tutorials/distribute/keras 上的分布式 Keras 官方教程示例。我在第一个节点上运行：
library(golem)
library(shiny)
library(shinydashboard)
library(shinydashboardPlus)
# Demo app: a right-sidebar slider is shown or hidden depending on which
# navbarPage tab / tabsetPanel pane is currently active, using golem's JS
# helpers ("showid" / "hideid").
shinyApp(
  ui = tags$body(
    class = "skin-blue sidebar-mini control-sidebar-open",
    dashboardPagePlus(
      # Bug fix: R is case-sensitive; the shinydashboardPlus constructor is
      # dashboardHeaderPlus(), not dashboardheaderPlus() (undefined -> error).
      header = dashboardHeaderPlus(
        enable_rightsidebar = TRUE,
        rightSidebarIcon = "gears"
      ),
      sidebar = dashboardSidebar(),
      body = dashboardBody(
        golem::activate_js(),
        navbarPage(
          "Navbar!",
          id = "tabactive",
          tabPanel(
            "Summary",
            tabsetPanel(
              id = "tabB",
              type = "tabs",
              tabPanel("Plot3"),
              tabPanel("Plot4")
            )
          ),
          tabPanel(
            "Available funds",
            tabsetPanel(
              id = "tabA",
              tabPanel("Plot"),
              tabPanel("Plot2")
            )
          )
        )
      ),
      rightsidebar = rightSidebar(
        background = "dark",
        rightSidebarTabContent(
          id = 1,
          title = "Tab 1",
          icon = "desktop",
          active = TRUE,
          # Bug fix: sl2/sl3/sl4 were rendered by the server and toggled by
          # the observers below, but never placed in the UI, so the
          # show/hide calls had no element to act on.
          uiOutput("sl"),
          uiOutput("sl2"),
          uiOutput("sl3"),
          uiOutput("sl4")
        )
      ),
      title = "Right Sidebar"
    )
  ),
  server = function(input, output) {
    # One slider per tab pane. Bug fix: sliderInput() requires inputId,
    # label, min, max and value; the original obs2/obs3/obs4 calls omitted
    # required arguments and would error at render time.
    output$sl <- renderUI({
      req(input$tabactive)
      sliderInput("obs", "Number of observations:",
                  min = 0, max = 1000, value = 500)
    })
    output$sl2 <- renderUI({
      req(input$tabactive)
      sliderInput("obs2", "Number of observations:",
                  min = 0, max = 100, value = 50)
    })
    output$sl3 <- renderUI({
      req(input$tabactive)
      sliderInput("obs3", "Number of observations:",
                  min = 0, max = 100, value = 50)
    })
    output$sl4 <- renderUI({
      req(input$tabactive)
      sliderInput("obs4", "Number of observations:",
                  min = 0, max = 100, value = 50)
    })

    # Register an observer that shows `output_id` only while the given
    # (navbar tab, tabset pane) pair is active, and hides it otherwise.
    # isTRUE() guards the comparisons: a not-yet-initialised tabset input is
    # NULL, and NULL == "x" yields logical(0), which would make a bare
    # `&&` condition error inside if().
    toggle_on_tab <- function(nav_value, tabset_input, pane_value, output_id) {
      observe({
        visible <- isTRUE(input$tabactive == nav_value) &&
          isTRUE(input[[tabset_input]] == pane_value)
        golem::invoke_js(if (visible) "showid" else "hideid", output_id)
      })
    }
    toggle_on_tab("Available funds", "tabA", "Plot", "sl")
    toggle_on_tab("Available funds", "tabA", "Plot2", "sl2")
    toggle_on_tab("Summary", "tabB", "Plot3", "sl3")
    toggle_on_tab("Summary", "tabB", "Plot4", "sl4")
  }
)
，在第一个节点上运行
TF_CONFIG='{"cluster": {"worker": ["ip1:2222","ip2:2222"]},"task": {"index": 0,"type": "worker"}}' python3 test.py
，在第二个节点上运行
TF_CONFIG='{"cluster": {"worker": ["ip1:2222","ip2:2222"]},"task": {"index": 1,"type": "worker"}}' python3 test.py
。当我打印 device_lib.list_local_devices() 时，两个节点都检测到了 GPU，但随后出现以下错误；当我不设置 TF_CONFIG 而单独运行它们时，一切正常。有什么想法吗？
node1:
TF_CONFIG
node2:
2019-11-13 18:20:00.974896: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:worker/replica:0/task:0/device:GPU:0 with 7658 MB memory) -> physical GPU (device: 0,pci bus id: 0000:84:00.0,compute capability: 3.5)
2019-11-13 18:20:00.977161: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:250] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222,1 -> ip2:2222}
2019-11-13 18:20:00.981865: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2222
Segmentation fault (core dumped)