fuzzyjoin: Join Tables Together on Inexact Matching

厳密な一致によらない(あいまいな一致にもとづく)テーブルの結合を実行する

> library(fuzzyjoin)
> data("misspellings")

バージョン: 0.1.2


関数名 概略
difference_join Join two tables based on absolute difference between their columns
distance_join Join two tables based on a distance metric of one or more columns
fuzzy_join Join two tables based not on exact matches, but rather with a function describing whether two vectors are matched or not
geo_join Join two tables based on a geo distance of longitudes and latitudes
misspellings A corpus of common misspellings, for examples and practice
regex_join Join two tables based on a regular expression in one column matching the other
stringdist_join Join two tables based on fuzzy string matching of their columns

geo_join

緯度経度にもとづく地点間の結合

> library(dplyr)
> data("state")
> 
> states <- data_frame(state = state.name,
+                      longitude = state.center$x,
+                      latitude = state.center$y)
> 
> s1 <- rename(states, state1 = state)
> s2 <- rename(states, state2 = state)
> 
> s1 %>%
+  geo_inner_join(s2, max_dist = 200, distance_col = "distance") %>%
+  filter(state1 != state2)
Joining by: c("longitude", "latitude")
Warning in data.matrix(data): NAs introduced by coercion

Warning in data.matrix(data): NAs introduced by coercion
Error in filter(., state1 != state2): object 'state1' not found

(注: 上の例は、出力のとおりエラーで終了している。以下は都道府県の緯度経度を用いた別の例。)

> d1 <- data_frame(
+   pref = c("東京都", "富山県", "北海道"),
+   lat = c(35.689488, 36.695291, 43.064615),
+   lon = c(139.691706, 137.211338, 141.346807)
+ )
> 
> d2 <- data_frame(
+   pref2 = c("東京都", "新潟県"),
+   lat = c(35.689488, 37.902552),
+   lon = c(139.691706, 139.023095)
+ )
> 
> d2 %>%
+   geo_inner_join(d1, max_dist = 10000, distance_col = "distance",
+                  method = "geo", unit = "km") %>% 
+   arrange(distance)
Joining by: c("lat", "lon")
# A tibble: 6 × 7
   pref2    lat.x    lon.x   pref    lat.y    lon.y distance
   <chr>    <dbl>    <dbl>  <chr>    <dbl>    <dbl>    <dbl>
1 東京都 35.68949 139.6917 東京都 35.68949 139.6917   0.0000
2 新潟県 37.90255 139.0231 富山県 36.69529 137.2113 209.1699
3 東京都 35.68949 139.6917 富山県 36.69529 137.2113 249.4418
4 新潟県 37.90255 139.0231 東京都 35.68949 139.6917 252.7354
5 新潟県 37.90255 139.0231 北海道 43.06462 141.3468 606.0606
6 東京都 35.68949 139.6917 北海道 43.06462 141.3468 831.0837

misspellings

> misspellings
# A tibble: 4,505 × 2
   misspelling    correct
         <chr>      <chr>
1   abandonned  abandoned
2    aberation aberration
3     abilties  abilities
4       abilty    ability
5      abondon    abandon
6       abbout      about
7        abotu      about
8       abouta    about a
9      aboutit   about it
10    aboutthe  about the
# ... with 4,495 more rows

stringdist_join

Arguments

  • x
  • y
  • by
  • max_dist
  • method
  • mode
  • ignore_case
  • distance_col
  • ...

> data("DICTIONARY", package = "qdapDictionaries")
> words <- DICTIONARY
> 
> set.seed(71)
> sub_misspellings <- misspellings %>% dplyr::sample_n(1000)
> 
> sub_misspellings %>%
+   stringdist_inner_join(words, by = c(misspelling = "word"), max_dist = 1)
# A tibble: 701 × 4
    misspelling      correct         word syllables
          <chr>        <chr>        <chr>     <dbl>
1  masterbation masturbation masturbation         4
2       elphant     elephant     elephant         3
3   unilateraly unilaterally   unilateral         5
4     auxillary    auxiliary    auxiliary         4
5         borke        broke         bore         1
6         borke        broke        borne         1
7         borke        broke        broke         1
8     relevence    relevance    relevance         3
9        devide       divide       decide         2
10       devide       divide       deride         2
# ... with 691 more rows
> sub_misspellings %>%
+   stringdist_inner_join(words, by = c(misspelling = "word"), method = "soundex")
# A tibble: 16,103 × 4
   misspelling    correct       word syllables
         <chr>      <chr>      <chr>     <dbl>
1   emmisaries emissaries   emigrate         3
2   emmisaries emissaries emigration         4
3   emmisaries emissaries     emigre         3
4   emmisaries emissaries   emissary         4
5   emmisaries emissaries   encircle         3
6   emmisaries emissaries     encore         2
7   emmisaries emissaries  encourage         3
8   emmisaries emissaries    engraft         2
9   emmisaries emissaries    engross         2
10  emmisaries emissaries    enquire         2
# ... with 16,093 more rows