multidplyr: Partitioned data frames for 'dplyr'

{dplyr}パッケージの処理をマルチコアで並列実行するためのパッケージ

> library(multidplyr)

バージョン: 0.0.0.9000


関数名 概略
cluster_call Call a function on each node of a cluster
cluster_eval Evaluate arbitrary code on each node
cluster_library Attach a library on each node.
create_cluster Create a new cluster with sensible defaults.
default_cluster Cluster management.
objman Object management
partition Partition data across a cluster.
src_cluster A cluster.

cluster_call

> cl <- get_default_cluster()
Initialising 7 core cluster.
> f <- function() 0
> cl %>% cluster_call(f)
[[1]]
[1] 0

[[2]]
[1] 0

[[3]]
[1] 0

[[4]]
[1] 0

[[5]]
[1] 0

[[6]]
[1] 0

[[7]]
[1] 0

cluster_eval

Arguments

  • cluster
  • expr

> cl <- get_default_cluster()
> 
> cl %>% cluster_eval(expr = 1 + 1)
[[1]]
[1] 2

[[2]]
[1] 2

[[3]]
[1] 2

[[4]]
[1] 2

[[5]]
[1] 2

[[6]]
[1] 2

[[7]]
[1] 2

> cl %>% cluster_eval_(quote(1 + 1))
[[1]]
[1] 2

[[2]]
[1] 2

[[3]]
[1] 2

[[4]]
[1] 2

[[5]]
[1] 2

[[6]]
[1] 2

[[7]]
[1] 2

cluster_library

> cl <- create_cluster(2)
Initialising 2 core cluster.
> cl %>% cluster_library("magrittr") %>% 
+   cluster_eval(search())
[[1]]
 [1] ".GlobalEnv"        "package:magrittr"  "package:stats"    
 [4] "package:graphics"  "package:grDevices" "package:utils"    
 [7] "package:datasets"  "JapanEnv"          "package:methods"  
[10] "Autoloads"         "package:base"     

[[2]]
 [1] ".GlobalEnv"        "package:magrittr"  "package:stats"    
 [4] "package:graphics"  "package:grDevices" "package:utils"    
 [7] "package:datasets"  "JapanEnv"          "package:methods"  
[10] "Autoloads"         "package:base"

create_cluster

> create_cluster(2)
Initialising 2 core cluster.
socket cluster with 2 nodes on host 'localhost'

default_cluster

クラスタ管理

> (cl <- get_default_cluster())
socket cluster with 7 nodes on host 'localhost'

objman

オブジェクト管理

partition

> data("flights", package = "nycflights13")
> flights %>% dplyr::glimpse()
Observations: 336,776
Variables: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,...
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55...
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60...
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2...
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7...
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7...
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -...
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",...
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79...
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN...
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"...
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"...
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138...
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94...
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,...
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ time_hour      <time> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013...
> flights1 <- partition(flights, flight)
> flights2 <- dplyr::summarise(flights1, dep_delay = mean(dep_delay, na.rm = TRUE))
> flights3 <- dplyr::collect(flights2)