janitor: Utilities for Data Cleaning

> library(janitor)

バージョン: 0.2.1


関数名 概略
add_totals_col Append a totals column to a data.frame.
add_totals_row Append a totals row to a data.frame.
adorn_crosstab Add formatting to a crosstabulation table.
clean_names Cleans names of a data.frame.
convert_to_NA Convert string values to true 'NA' values.
crosstab Generate a crosstabulation of two vectors.
excel_numeric_to_date Convert dates encoded as serial numbers to Date class.
get_dupes Get rows of a 'data.frame' with identical values for the specified variables.
janitor janitor
ns_to_percents Convert a numeric data.frame to row-, column-, or totals-wise percentages.
remove_empty_cols Removes empty columns from a data.frame.
remove_empty_rows Removes empty rows from a data.frame.
tabyl Generate a frequency table from a vector.
top_levels Generate a frequency table of a factor grouped into top-n, bottom-n, and all other levels.
use_first_valid_of Returns first non-NA value from a set of vectors.

add_totals_col

各行について複数列の値を合計し、その結果を合計列（Total）としてデータフレームに追加する

> mtcars %>%
+   crosstab(am, cyl) %>%
+   add_totals_col()
  am 4 6  8 Total
1  0 3 4 12    19
2  1 8 3  2    13

add_totals_row

> iris %>% 
+   head() %>% 
+   select(-Species) %>% 
+   add_totals_row()
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1          5.1         3.5          1.4         0.2
2          4.9         3.0          1.4         0.2
3          4.7         3.2          1.3         0.2
4          4.6         3.1          1.5         0.2
5            5         3.6          1.4         0.2
6          5.4         3.9          1.7         0.4
7        Total        20.3          8.7         1.4

adorn_crosstab

> mtcars %>%
+  crosstab(gear, cyl) %>%
+  adorn_crosstab(denom = "all")
  gear         4         6          8
1    3  3.1% (1)  6.2% (2) 37.5% (12)
2    4 25.0% (8) 12.5% (4)  0.0%  (0)
3    5  6.2% (2)  3.1% (1)  6.2%  (2)

clean_names

> names(iris)
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
[5] "Species"
> clean_names(iris) %>% names()
[1] "sepal_length" "sepal_width"  "petal_length" "petal_width" 
[5] "species"
> clean_names(trees) %>% names()
[1] "girth"  "height" "volume"

convert_to_NA

特定の値をNAに置換する

> convert_to_NA(mtcars, "4")
> convert_to_NA(letters, c("b", "d"))

crosstab

2つのベクトルからクロステーブルを作成する

> crosstab(mtcars$cyl, mtcars$gear)
  mtcars$cyl  3 4 5
1          4  1 8 2
2          6  2 4 1
3          8 12 0 2
> mtcars %>% crosstab(cyl, gear, percent = "row")
  cyl               3              4              5
1   4 0.0909090909091 0.727272727273 0.181818181818
2   6 0.2857142857143 0.571428571429 0.142857142857
3   8 0.8571428571429 0.000000000000 0.142857142857

excel_numeric_to_date

> excel_numeric_to_date(40000)
[1] "2009-07-06"

get_dupes

特定の列で重複のある行の抽出

> get_dupes(mtcars, mpg, hp)
# A tibble: 2 × 12
    mpg    hp dupe_count   cyl  disp  drat    wt  qsec    vs    am  gear
  <dbl> <dbl>      <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1    21   110          2     6   160   3.9 2.620 16.46     0     1     4
2    21   110          2     6   160   3.9 2.875 17.02     0     1     4
# ... with 1 more variables: carb <dbl>