|
57 | 57 | #' |
58 | 58 | #' ## Partitioned output |
59 | 59 | #' |
60 | | -#' It is possible to export a LazyFrame to multiple files, also called |
61 | | -#' *partitioned output*. A partition can be determined in several ways: |
62 | | -#' |
63 | | -#' - by key(s): split by the values of keys. The amount of files that can be |
64 | | -#' written is not limited. However, when writing beyond a certain amount of |
65 | | -#' files, the data for the remaining partitions is buffered before writing to |
66 | | -#' the file. |
67 | | -#' - by maximum number of rows: if the number of rows in a file reaches the |
68 | | -#' maximum number of rows, the file is closed and a new file is opened. |
69 | | - |
70 | | -# TODO: add this back when https://github.com/pola-rs/r-polars/issues/1522 is |
71 | | -# solved |
72 | | -# - by "sorted partition": this is a specialized version of partitioning by |
73 | | -# key. Whereas partitioning by key accepts data in any order, this scheme |
74 | | -# expects the input data to be pre-grouped or pre-sorted. This scheme suffers |
75 | | -# a lot less overhead, but may not be always applicable. Each new value of |
76 | | -# the key expressions starts a new partition, therefore repeating the same |
77 | | -# value multiple times may overwrite previous partitions. |
78 | | -# These partitioning schemes can be used with the functions `partition_by_key()`, |
79 | | -# `partition_by_max_size()`, and `partition_parted()`. See Examples below. |
80 | | - |
81 | | -#' |
82 | | -#' These partitioning schemes can be used with the functions `partition_by_key()` |
83 | | -#' and `partition_by_max_size()`. See Examples below. |
84 | | -#' |
85 | | -#' Writing a partitioned output usually requires setting `mkdir = TRUE` to |
86 | | -#' automatically create the required subfolders. |
| 60 | +#' It is possible to export data to multiple files based on various parameters, |
| 61 | +#' such as the values of some variables, or such that each file has a maximum |
| 62 | +#' number of rows. See [partition_by()] for more details. |
87 | 63 | #' |
88 | 64 | #' @return The input LazyFrame. |
89 | 65 | #' @export |
|
124 | 100 | #' out_path <- withr::local_tempdir() |
125 | 101 | #' sink_parquet(my_lf, partition_by_max_size(out_path, max_size = 5), mkdir = TRUE) |
126 | 102 | #' fs::dir_tree(out_path) # mtcars has 32 rows so we have 7 output files |
127 | | - |
128 | | -# TODO: add this back when https://github.com/pola-rs/r-polars/issues/1522 is |
129 | | -# solved |
130 | | -# |
131 | | -# # Split the LazyFrame by pre-sorted data: |
132 | | -# out_path <- withr::local_tempdir() |
133 | | -# my_lf |> |
134 | | -# arrange(am, cyl) |> |
135 | | -# sink_parquet(partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
136 | | -# |
137 | | -# fs::dir_tree(out_path) |
138 | | -# |
139 | | -# # Careful when using partition_parted(): if the data is not presorted then |
140 | | -# # the output files may be incorrect! |
141 | | -# out_path <- withr::local_tempdir() |
142 | | -# sink_parquet(my_lf, partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
143 | 103 | sink_parquet <- function( |
144 | 104 | .data, |
145 | 105 | path, |
@@ -264,22 +224,6 @@ sink_parquet <- function( |
264 | 224 | #' out_path <- withr::local_tempdir() |
265 | 225 | #' sink_csv(my_lf, partition_by_max_size(out_path, max_size = 5), mkdir = TRUE) |
266 | 226 | #' fs::dir_tree(out_path) # mtcars has 32 rows so we have 7 output files |
267 | | - |
268 | | -# TODO: add this back when https://github.com/pola-rs/r-polars/issues/1522 is |
269 | | -# solved |
270 | | -# |
271 | | -# # Split the LazyFrame by pre-sorted data: |
272 | | -# out_path <- withr::local_tempdir() |
273 | | -# my_lf |> |
274 | | -# arrange(am, cyl) |> |
275 | | -# sink_csv(partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
276 | | -# |
277 | | -# fs::dir_tree(out_path) |
278 | | -# |
279 | | -# # Careful when using partition_parted(): if the data is not presorted then |
280 | | -# # the output files may be incorrect! |
281 | | -# out_path <- withr::local_tempdir() |
282 | | -# sink_csv(my_lf, partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
283 | 227 | sink_csv <- function( |
284 | 228 | .data, |
285 | 229 | path, |
@@ -417,22 +361,6 @@ sink_csv <- function( |
417 | 361 | #' out_path <- withr::local_tempdir() |
418 | 362 | #' sink_ipc(my_lf, partition_by_max_size(out_path, max_size = 5), mkdir = TRUE) |
419 | 363 | #' fs::dir_tree(out_path) # mtcars has 32 rows so we have 7 output files |
420 | | - |
421 | | -# TODO: add this back when https://github.com/pola-rs/r-polars/issues/1522 is |
422 | | -# solved |
423 | | -# |
424 | | -# # Split the LazyFrame by pre-sorted data: |
425 | | -# out_path <- withr::local_tempdir() |
426 | | -# my_lf |> |
427 | | -# arrange(am, cyl) |> |
428 | | -# sink_ipc(partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
429 | | -# |
430 | | -# fs::dir_tree(out_path) |
431 | | -# |
432 | | -# # Careful when using partition_parted(): if the data is not presorted then |
433 | | -# # the output files may be incorrect! |
434 | | -# out_path <- withr::local_tempdir() |
435 | | -# sink_ipc(my_lf, partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
436 | 364 | sink_ipc <- function( |
437 | 365 | .data, |
438 | 366 | path, |
@@ -520,22 +448,6 @@ sink_ipc <- function( |
520 | 448 | #' out_path <- withr::local_tempdir() |
521 | 449 | #' sink_ndjson(my_lf, partition_by_max_size(out_path, max_size = 5), mkdir = TRUE) |
522 | 450 | #' fs::dir_tree(out_path) # mtcars has 32 rows so we have 7 output files |
523 | | - |
524 | | -# TODO: add this back when https://github.com/pola-rs/r-polars/issues/1522 is |
525 | | -# solved |
526 | | -# |
527 | | -# # Split the LazyFrame by pre-sorted data: |
528 | | -# out_path <- withr::local_tempdir() |
529 | | -# my_lf |> |
530 | | -# arrange(am, cyl) |> |
531 | | -# sink_ndjson(partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
532 | | -# |
533 | | -# fs::dir_tree(out_path) |
534 | | -# |
535 | | -# # Careful when using partition_parted(): if the data is not presorted then |
536 | | -# # the output files may be incorrect! |
537 | | -# out_path <- withr::local_tempdir() |
538 | | -# sink_ndjson(my_lf, partition_parted(out_path, by = c("am", "cyl")), mkdir = TRUE) |
539 | 451 | sink_ndjson <- function( |
540 | 452 | .data, |
541 | 453 | path, |
@@ -572,74 +484,3 @@ sink_ndjson <- function( |
572 | 484 | mkdir = mkdir |
573 | 485 | ) |
574 | 486 | } |
575 | | - |
576 | | -#' Helper functions to export a LazyFrame as a partitioned output |
577 | | -#' |
578 | | -#' `r lifecycle::badge("experimental")` |
579 | | -#' More details and examples in the documentation of `sink_*()` functions. |
580 | | -#' |
581 | | -#' @inheritParams rlang::args_dots_empty |
582 | | -#' @param base_path The base path for the output files. Use the `mkdir` option |
583 | | -#' of the `sink_*` methods to ensure directories in the path are created. |
584 | | -#' @param by Something that can be coerced to a list of Polars expressions.
585 | | -#' The output is partitioned by the values of these expressions.
586 | | -#' @param include_key If `TRUE` (default), include the key columns in the output |
587 | | -#' files. |
588 | | -#' @param per_partition_sort_by Something that can be coerced to a list of
589 | | -#' Polars expressions, or `NULL` (default). Used to sort the rows within each partition.
590 | | -#' Note that this might increase the memory consumption needed for each partition. |
591 | | -#' @param max_size An integer-ish value indicating the maximum number of rows in |
592 | | -#' each of the generated files. |
593 | | -#' |
594 | | -#' @name partitioned_output |
595 | | -#' @export |
596 | | -partition_by_key <- function(
597 | | - base_path,
598 | | - ...,
599 | | - by,
600 | | - include_key = TRUE,
601 | | - per_partition_sort_by = NULL
602 | | -) {
603 | | - check_dots_empty() # all arguments after `base_path` must be named
604 | | - pl$PartitionByKey(
605 | | - base_path = base_path,
606 | | - by = by,
607 | | - include_key = include_key, # forward the caller's value instead of a hard-coded TRUE
608 | | - per_partition_sort_by = per_partition_sort_by # forward instead of always passing NULL
609 | | - )
610 | | -}
611 | | - |
612 | | -#' @rdname partitioned_output |
613 | | -#' @export |
614 | | -partition_by_max_size <- function(
615 | | - base_path,
616 | | - ...,
617 | | - max_size,
618 | | - per_partition_sort_by = NULL
619 | | -) {
620 | | - check_dots_empty() # all arguments after `base_path` must be named
621 | | - pl$PartitionMaxSize(
622 | | - base_path = base_path,
623 | | - max_size = max_size,
624 | | - per_partition_sort_by = per_partition_sort_by # forward instead of always passing NULL
625 | | - )
626 | | -}
627 | | - |
628 | | -# TODO: add this back when https://github.com/pola-rs/r-polars/issues/1522 is |
629 | | -# solved |
630 | | -# @export |
631 | | -partition_parted <- function(
632 | | - base_path,
633 | | - ...,
634 | | - by,
635 | | - include_key = TRUE,
636 | | - per_partition_sort_by = NULL
637 | | -) {
638 | | - check_dots_empty() # all arguments after `base_path` must be named
639 | | - pl$PartitionParted(
640 | | - base_path = base_path,
641 | | - by = by,
642 | | - include_key = include_key, # forward the caller's value instead of a hard-coded TRUE
643 | | - per_partition_sort_by = per_partition_sort_by # forward instead of always passing NULL
644 | | - )
645 | | -}
0 commit comments