-
Notifications
You must be signed in to change notification settings - Fork 76
Closed
Labels
Description
`branch_init()` creates deep copies of environments when it creates dynamic branches:
Lines 10 to 17 in 3760219
command <- command_clone(command)
deps <- union(command$deps, deps)
command$deps <- setdiff(deps, settings$dimensions)
command$seed <- tar_seed_create(child)
pedigree <- pedigree_new(settings$name, child, index)
settings <- settings_clone(settings)
settings$name <- child
store <- settings_produce_store(settings)
This could be responsible for much of the slowness and memory consumption we see in heavily dynamically-branched pipelines. To test this empirically, let's consider an environment that holds deep copies of nested environments (the current approach in `targets`):
#' Build an environment of `n` branches, each with its own inner environment
#'
#' Every branch gets a freshly created `inner` environment holding `data`,
#' so no inner environment is shared between branches.
#'
#' @param n Number of branches to create.
#' @param data Object stored as `data` inside each branch's `inner`
#'   environment.
#' @return An environment (parent: the empty environment) whose bindings are
#'   named `"1"`, ..., `as.character(n)`; each binding is a branch
#'   environment containing `inner`.
list_deep <- function(n, data) {
  ids <- seq_len(n)
  make_branch <- function(id) {
    # A new inner environment per branch: this is the "deep copy" cost.
    payload <- new.env(parent = emptyenv())
    assign("data", data, envir = payload)
    branch <- new.env(parent = emptyenv())
    assign("inner", payload, envir = branch)
    branch
  }
  branches <- lapply(ids, make_branch)
  names(branches) <- as.character(ids)
  list2env(branches, parent = emptyenv())
}
After much refactoring, `targets` could adopt the following approach, in which certain inner environments are shared by all branches:
#' Build an environment of `n` branches that all share one inner environment
#'
#' Each branch is a new environment, but its `inner` binding points at the
#' same `inner` object supplied by the caller.
#'
#' @param n Number of branches to create.
#' @param inner Object bound as `inner` in every branch (the benchmark passes
#'   an environment).
#' @return An environment (parent: the empty environment) whose bindings are
#'   named `"1"`, ..., `as.character(n)`; each binding is a branch
#'   environment containing `inner`.
list_shallow <- function(n, inner) {
  ids <- seq_len(n)
  make_branch <- function(id) {
    branch <- new.env(parent = emptyenv())
    # Bind the shared object; nothing is copied here.
    assign("inner", inner, envir = branch)
    branch
  }
  branches <- lapply(ids, make_branch)
  names(branches) <- as.character(ids)
  list2env(branches, parent = emptyenv())
}
Let's see how speed and memory differ:
#' Compare size and elapsed time of the shallow and deep layouts for `n` branches
#'
#' Each timing covers both building the structure and measuring it with
#' `lobstr::obj_size()`, because both happen inside the `system.time()` call.
#' The shallow layout is timed first, then the deep layout.
#'
#' @param n Number of branches passed to `list_shallow()` and `list_deep()`.
#' @return A one-row tibble with `n`, the byte counts and elapsed seconds of
#'   both layouts, and the shallow / deep ratios of each.
benchmarks <- function(n) {
  data <- data.frame(x = rnorm(10))
  inner <- new.env(parent = emptyenv())
  assign("data", data, envir = inner)

  timing_shallow <- system.time({
    bytes_shallow <- as.numeric(lobstr::obj_size(list_shallow(n, inner)))
  })
  timing_deep <- system.time({
    bytes_deep <- as.numeric(lobstr::obj_size(list_deep(n, data)))
  })
  seconds_shallow <- timing_shallow["elapsed"]
  seconds_deep <- timing_deep["elapsed"]

  tibble::tibble(
    n = n,
    bytes_shallow = bytes_shallow,
    bytes_deep = bytes_deep,
    bytes_proportion = bytes_shallow / bytes_deep,
    seconds_shallow = seconds_shallow,
    seconds_deep = seconds_deep,
    seconds_proportion = seconds_shallow / seconds_deep
  )
}
# Run the benchmark at n = 1e3, 1e4, 1e5 and 1e6, then stack the results.
dplyr::bind_rows(lapply(10^(3:6), benchmarks))
#> # A tibble: 4 × 7
#> n bytes_shallow bytes_deep bytes_proportion seconds_shallow seconds_deep seconds_proportion
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1000 515072 906680 0.568 0.0360 0.00500 7.20
#> 2 10000 5137472 9057080 0.567 0.0320 0.0530 0.604
#> 3 100000 51361472 90561080 0.567 0.341 0.570 0.598
#> 4 1000000 513601472 905601080 0.567 5.75 10.3 0.558
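In this experiment, switching to `list_shallow()` cuts memory consumption by about 43% and, for n ≥ 10,000, elapsed time by about 40–44%, i.e. nearly by half. (The n = 1000 timings are only a few hundredths of a second, too small to compare reliably.)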