multidplyr: Partitioned data frames for 'dplyr'
{dplyr} パッケージの処理をマルチコアで並列実行するためのパッケージです。
- CRAN: http://cran.r-project.org/web/packages/multidplyr/index.html
- GitHub: https://github.com/hadley/multidplyr
- Vignettes:
> library(multidplyr)
バージョン: 0.0.0.9000
関数名 | 概略 |
---|---|
cluster_call | Call a function on each node of a cluster |
cluster_eval | Evaluate arbitrary code on each node |
cluster_library | Attach a library on each node. |
create_cluster | Create a new cluster with sensible defaults. |
default_cluster | Cluster management. |
objman | Object management |
partition | Partition data across a cluster. |
src_cluster | A cluster. |
cluster_call
> cl <- get_default_cluster()
Initialising 7 core cluster.
> f <- function() 0
> cl %>% cluster_call(f)
[[1]]
[1] 0
[[2]]
[1] 0
[[3]]
[1] 0
[[4]]
[1] 0
[[5]]
[1] 0
[[6]]
[1] 0
[[7]]
[1] 0
cluster_eval
Arguments
- cluster
- expr
> cl <- get_default_cluster()
>
> cl %>% cluster_eval(expr = 1 + 1)
[[1]]
[1] 2
[[2]]
[1] 2
[[3]]
[1] 2
[[4]]
[1] 2
[[5]]
[1] 2
[[6]]
[1] 2
[[7]]
[1] 2
> cl %>% cluster_eval_(quote(1 + 1))
[[1]]
[1] 2
[[2]]
[1] 2
[[3]]
[1] 2
[[4]]
[1] 2
[[5]]
[1] 2
[[6]]
[1] 2
[[7]]
[1] 2
cluster_library
> cl <- create_cluster(2)
Initialising 2 core cluster.
> cl %>% cluster_library("magrittr") %>%
+ cluster_eval(search())
[[1]]
[1] ".GlobalEnv" "package:magrittr" "package:stats"
[4] "package:graphics" "package:grDevices" "package:utils"
[7] "package:datasets" "JapanEnv" "package:methods"
[10] "Autoloads" "package:base"
[[2]]
[1] ".GlobalEnv" "package:magrittr" "package:stats"
[4] "package:graphics" "package:grDevices" "package:utils"
[7] "package:datasets" "JapanEnv" "package:methods"
[10] "Autoloads" "package:base"
create_cluster
> create_cluster(2)
Initialising 2 core cluster.
socket cluster with 2 nodes on host 'localhost'
get_default_cluster
クラスタ管理
> (cl <- get_default_cluster())
socket cluster with 7 nodes on host 'localhost'
objman
オブジェクト管理
partition
> data("flights", package = "nycflights13")
> flights %>% dplyr::glimpse()
Observations: 336,776
Variables: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,...
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55...
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60...
$ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2...
$ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7...
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7...
$ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -...
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",...
$ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79...
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN...
$ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"...
$ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"...
$ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138...
$ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94...
$ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,...
$ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ time_hour <time> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013...
> flights1 <- partition(flights, flight)
> flights2 <- dplyr::summarise(flights1, dep_delay = mean(dep_delay, na.rm = TRUE))
> flights3 <- dplyr::collect(flights2)