fuzzyjoin: Join Tables Together on Inexact Matching
厳密でない一致（あいまい一致）にもとづいてテーブルを結合する
- CRAN: https://cran.r-project.org/package=fuzzyjoin
- Vignettes: Example of stringdist_inner_join: Correcting misspellings against a dictionary
- GitHub: https://github.com/dgrtwo/fuzzyjoin
> library(fuzzyjoin)
> data("misspellings")
バージョン: 0.1.2
| 関数名 | 概略 |
|---|---|
| difference_join | Join two tables based on absolute difference between their columns |
| distance_join | Join two tables based on a distance metric of one or more columns |
| fuzzy_join | Join two tables based not on exact matches, but rather with a function describing whether two vectors are matched or not |
| geo_join | Join two tables based on a geo distance of longitudes and latitudes |
| misspellings | A corpus of common misspellings, for examples and practice |
| regex_join | Join two tables based on a regular expression in one column matching the other |
| stringdist_join | Join two tables based on fuzzy string matching of their columns |
geo_join
緯度経度にもとづく地点間の結合
> library(dplyr)
> data("state")
>
> states <- data_frame(state = state.name,
+ longitude = state.center$x,
+ latitude = state.center$y)
>
> s1 <- rename(states, state1 = state)
> s2 <- rename(states, state2 = state)
>
> s1 %>%
+ geo_inner_join(s2, max_dist = 200, distance_col = "distance") %>%
+ filter(state1 != state2)
Joining by: c("longitude", "latitude")
Warning in data.matrix(data): NAs introduced by coercion
Warning in data.matrix(data): NAs introduced by coercion
Error in filter(., state1 != state2): object 'state1' not found
> d1 <- data_frame(
+ pref = c("東京都", "富山県", "北海道"),
+ lat = c(35.689488, 36.695291, 43.064615),
+ lon = c(139.691706, 137.211338, 141.346807)
+ )
>
> d2 <- data_frame(
+ pref2 = c("東京都", "新潟県"),
+ lat = c(35.689488, 37.902552),
+ lon = c(139.691706, 139.023095)
+ )
>
> d2 %>%
+ geo_inner_join(d1, max_dist = 10000, distance_col = "distance",
+ method = "geo", unit = "km") %>%
+ arrange(distance)
Joining by: c("lat", "lon")
# A tibble: 6 × 7
pref2 lat.x lon.x pref lat.y lon.y distance
<chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 東京都 35.68949 139.6917 東京都 35.68949 139.6917 0.0000
2 新潟県 37.90255 139.0231 富山県 36.69529 137.2113 209.1699
3 東京都 35.68949 139.6917 富山県 36.69529 137.2113 249.4418
4 新潟県 37.90255 139.0231 東京都 35.68949 139.6917 252.7354
5 新潟県 37.90255 139.0231 北海道 43.06462 141.3468 606.0606
6 東京都 35.68949 139.6917 北海道 43.06462 141.3468 831.0837
misspellings
> misspellings
# A tibble: 4,505 × 2
misspelling correct
<chr> <chr>
1 abandonned abandoned
2 aberation aberration
3 abilties abilities
4 abilty ability
5 abondon abandon
6 abbout about
7 abotu about
8 abouta about a
9 aboutit about it
10 aboutthe about the
# ... with 4,495 more rows
stringdist_join
Arguments
- x
- y
- by
- max_dist
- method
- mode
- ignore_case
- distance_col
- ...
> data("DICTIONARY", package = "qdapDictionaries")
> words <- DICTIONARY
>
> set.seed(71)
> sub_misspellings <- misspellings %>% dplyr::sample_n(1000)
>
> sub_misspellings %>%
+ stringdist_inner_join(words, by = c(misspelling = "word"), max_dist = 1)
# A tibble: 701 × 4
misspelling correct word syllables
<chr> <chr> <chr> <dbl>
1 masterbation masturbation masturbation 4
2 elphant elephant elephant 3
3 unilateraly unilaterally unilateral 5
4 auxillary auxiliary auxiliary 4
5 borke broke bore 1
6 borke broke borne 1
7 borke broke broke 1
8 relevence relevance relevance 3
9 devide divide decide 2
10 devide divide deride 2
# ... with 691 more rows
> sub_misspellings %>%
+ stringdist_inner_join(words, by = c(misspelling = "word"), method = "soundex")
# A tibble: 16,103 × 4
misspelling correct word syllables
<chr> <chr> <chr> <dbl>
1 emmisaries emissaries emigrate 3
2 emmisaries emissaries emigration 4
3 emmisaries emissaries emigre 3
4 emmisaries emissaries emissary 4
5 emmisaries emissaries encircle 3
6 emmisaries emissaries encore 2
7 emmisaries emissaries encourage 3
8 emmisaries emissaries engraft 2
9 emmisaries emissaries engross 2
10 emmisaries emissaries enquire 2
# ... with 16,093 more rows