proxy: Distance and Similarity Measures

距離と類似度の測定

> library(proxy)

Attaching package: 'proxy'
The following objects are masked from 'package:stats':

    as.dist, dist
The following object is masked from 'package:base':

    as.matrix

バージョン: 0.4.15


.
dist Matrix Distance/Similarity Computation
pr_DB Registry of proximities
rowSums.dist Row Sums/Means of Sparse Symmetric Matrices
関数名 概略
dist Matrix Distance/Similarity Computation
pr_DB Registry of proximities
rowSums.dist Row Sums/Means of Sparse Symmetric Matrices

dist / simil

ref) base::dist()

Arguments

  • x
  • y
  • method... Euclidean ....
  • diag
  • upper
  • pairwise
  • by_rows
  • convert_similarities, convert_distances
  • auto_convert_data_frames
  • FUN
  • ...
> dist
function (x, y = NULL, method = NULL, ..., diag = FALSE, upper = FALSE, 
    pairwise = FALSE, by_rows = TRUE, convert_similarities = TRUE, 
    auto_convert_data_frames = TRUE) 
{
    if ((is.function(y) || is.character(y)) && is.null(method)) {
        method <- y
        y <- NULL
    }
    is.n_l_c <- function(x) all(sapply(x, is.numeric)) || all(sapply(x, 
        is.logical)) || all(sapply(x, is.complex))
    if (is.data.frame(x) && auto_convert_data_frames && is.n_l_c(x)) 
        x <- as.matrix(x)
    if (is.data.frame(y) && !is.null(y) && auto_convert_data_frames && 
        is.n_l_c(y)) 
        y <- as.matrix(y)
    if (is.vector(x) && is.atomic(x)) 
        x <- as.matrix(x)
    if (!is.null(y) && is.vector(y) && is.atomic(y)) 
        y <- as.matrix(y)
    reg_entry <- NULL
    if (is.null(method)) 
        method <- if (is.data.frame(x)) 
            "Gower"
        else if (is.logical(x)) 
            "Jaccard"
        else "Euclidean"
    if (!is.function(method)) 
        reg_entry <- if (inherits(method, "proxy_registry_entry")) 
            method
        else pr_DB$get_entry(method)
    if (!is.data.frame(x) && !is.matrix(x) && !is.list(x)) 
        stop("Can only handle data frames, vectors, matrices, and lists!")
    if (is.data.frame(x) && !by_rows) 
        stop("Cannot transpose mixed data frames")
    if (!is.null(y)) {
        if (is.data.frame(x) && !is.data.frame(y) || is.matrix(x) && 
            !is.matrix(y) || is.list(x) && !is.list(y)) 
            stop("x and y must be of same type.")
        if (is.matrix(x) && is.matrix(y) || is.data.frame(x) && 
            is.data.frame(y)) 
            if (by_rows && (ncol(x) != ncol(y))) 
                stop("x and y must be conform in columns.")
            else if (!by_rows && (nrow(x) != nrow(y))) 
                stop("x and y must be conform in rows.")
    }
    params <- list(...)
    if (!is.null(reg_entry)) {
        if (!is.na(reg_entry$PREFUN)) {
            tmp <- do.call(reg_entry$PREFUN, c(list(x, y, pairwise, 
                params, reg_entry)))
            if (!is.null(tmp)) {
                x <- tmp$x
                y <- tmp$y
                pairwise <- tmp$pairwise
                params <- tmp$p
                reg_entry <- tmp$reg_entry
            }
        }
        method <- reg_entry$FUN
    }
    .proxy_external <- function(CFUN, x, y) do.call(".External", 
        c(list(CFUN, x, y, pairwise, if (!is.function(method)) get(method) else method), 
            params))
    result <- if (!is.null(reg_entry) && !reg_entry$loop) {
        if (!by_rows && !is.list(x)) {
            x <- t(x)
            if (!is.null(y)) 
                y <- t(y)
        }
        if (reg_entry$C_FUN) 
            do.call(".Call", c(list(method), list(x), list(y), 
                pairwise, params, list(PACKAGE = reg_entry$PACKAGE)))
        else {
            if (!is.null(reg_entry$PACKAGE)) 
                do.call(method, c(list(x), list(y), params), 
                  envir = asNamespace(reg_entry$PACKAGE))
            else do.call(method, c(list(x), list(y), params))
        }
    }
    else if (is.null(y)) {
        if (!by_rows && !is.list(x)) 
            x <- t(x)
        if (is.list(x) && !is.null(reg_entry) && reg_entry$abcd) 
            x <- do.call("rbind", x)
        if (is.matrix(x) && !is.null(reg_entry) && reg_entry$abcd) 
            .proxy_external(R_apply_dist_binary_matrix, x != 
                0, NULL)
        else if (is.matrix(x)) 
            .proxy_external(R_apply_dist_matrix, x, NULL)
        else if (is.list(x) && !(is.data.frame(x) && by_rows)) 
            .proxy_external(R_apply_dist_list, x, NULL)
        else .proxy_external(R_apply_dist_data_frame, x, NULL)
    }
    else {
        if (!by_rows && !is.list(x)) {
            x <- t(x)
            y <- t(y)
        }
        if (is.list(x) && !is.null(reg_entry) && reg_entry$abcd) {
            x <- do.call("rbind", x)
            y <- do.call("rbind", x)
        }
        if (is.matrix(x) && !is.null(reg_entry) && reg_entry$abcd) 
            .proxy_external(R_apply_dist_binary_matrix, x != 
                0, y != 0)
        else if (is.matrix(x)) 
            .proxy_external(R_apply_dist_matrix, x, y)
        else if (is.list(x) && !(is.data.frame(x) && by_rows)) 
            .proxy_external(R_apply_dist_list, x, y)
        else .proxy_external(R_apply_dist_data_frame, x, y)
    }
    if (is.matrix(result) && is.null(dimnames(result))) 
        if (is.list(x) && !is.data.frame(x)) {
            rownames(result) <- names(x)
            colnames(result) <- names(y)
        }
        else if (by_rows) {
            rownames(result) <- rownames(x)
            colnames(result) <- rownames(y)
        }
        else {
            rownames(result) <- colnames(x)
            colnames(result) <- colnames(y)
        }
    if (!is.null(reg_entry)) {
        if (!is.na(reg_entry$POSTFUN)) 
            result <- do.call(reg_entry$POSTFUN, c(list(result, 
                params)))
        if (!reg_entry$distance && !(is.logical(convert_similarities) && 
            !convert_similarities)) {
            result <- if (is.function(convert_similarities) || 
                is.character(convert_similarities)) 
                do.call(convert_similarities, list(result))
            else if (is.null(reg_entry$convert)) 
                pr_simil2dist(result)
            else do.call(reg_entry$convert, list(result))
        }
        method <- reg_entry$names[1]
    }
    result <- if (is.matrix(result)) 
        structure(result, class = "crossdist")
    else if (inherits(result, "dist")) 
        structure(result, Diag = diag, Upper = upper)
    else structure(result, class = "pairdist")
    structure(result, method = if (is.character(method)) 
        method
    else if (missing(method)) 
        deparse(substitute(y))
    else deparse(substitute(method)), call = match.call())
}
<environment: namespace:proxy>
> sample(c(FALSE, TRUE), 8, rep = TRUE) %>% matrix(ncol = 2) %>% {
+   dist(., method = "Jaccard") %>% print()
+   dist(., method = "")
+ }
    1   2   3
2 1.0        
3 0.0 1.0    
4 1.0 0.5 1.0
Error in .get_entry_index(name, stop_if_missing): Entry "" not in registry.

pr_DB

> pr_DB
An object of class "registry" with 48 entries.
> summary(pr_DB)
* Similarity measures:
Braun-Blanquet, Chi-squared, correlation, cosine, Cramer, Dice,
eJaccard, Fager, Faith, Gower, Hamman, Jaccard, Kulczynski1,
Kulczynski2, Michael, Mountford, Mozley, Ochiai, Pearson, Phi,
Phi-squared, Russel, simple matching, Simpson, Stiles, Tanimoto,
Tschuprow, Yule, Yule2

* Distance measures:
Bhjattacharyya, Bray, Canberra, Chord, divergence, Euclidean,
fJaccard, Geodesic, Hellinger, Kullback, Levenshtein, Mahalanobis,
Manhattan, Minkowski, Podani, Soergel, supremum, Wave, Whittaker
> pr_DB$get_entry("Jaccard")
      names Jaccard, binary, Reyssac, Roux
        FUN R_bjaccard
   distance FALSE
     PREFUN pr_Jaccard_prefun
    POSTFUN NA
    convert pr_simil2dist
       type binary
       loop FALSE
      C_FUN TRUE
    PACKAGE proxy
       abcd FALSE
    formula a / (a + b + c)
  reference Jaccard, P. (1908). Nouvelles recherches sur la
            distribution florale. Bull. Soc. Vaud. Sci. Nat., 44,
            pp. 223--270.
description The Jaccard Similarity (C implementation) for binary
            data. It is the proportion of (TRUE, TRUE) pairs, but
            not considering (FALSE, FALSE) pairs. So it compares
            the intersection with the union of object sets.

rowSums.dist / rowMeans.dist / colSums.dist / colMeans.dist

> runif(10 * 2) %>% matrix(ncol = 2) %>% dist() %>% 
+   as.matrix() %>% {
+     print(.)
+     rowSums(.)
+   }
           1         2         3         4         5         6         7
1  0.0000000 0.5604138 0.3630898 0.4007057 0.3708736 0.5757461 0.4191103
2  0.5604138 0.0000000 0.3948037 0.4710453 0.6714994 0.6890793 0.9768773
3  0.3630898 0.3948037 0.0000000 0.5868850 0.2830718 0.8169070 0.7002736
4  0.4007057 0.4710453 0.5868850 0.0000000 0.7372394 0.2356526 0.7511751
5  0.3708736 0.6714994 0.2830718 0.7372394 0.0000000 0.9384471 0.5173843
6  0.5757461 0.6890793 0.8169070 0.2356526 0.9384471 0.0000000 0.8380427
7  0.4191103 0.9768773 0.7002736 0.7511751 0.5173843 0.8380427 0.0000000
8  0.4858666 0.8571609 0.8282448 0.4099925 0.8443178 0.3318496 0.5761292
9  0.2427433 0.5009358 0.4948941 0.1632721 0.5977090 0.3410606 0.5931067
10 0.5072823 0.8310136 0.4376350 0.8913188 0.1646215 1.0825972 0.5322357
           8         9        10
1  0.4858666 0.2427433 0.5072823
2  0.8571609 0.5009358 0.8310136
3  0.8282448 0.4948941 0.4376350
4  0.4099925 0.1632721 0.8913188
5  0.8443178 0.5977090 0.1646215
6  0.3318496 0.3410606 1.0825972
7  0.5761292 0.5931067 0.5322357
8  0.0000000 0.3640393 0.9548987
9  0.3640393 0.0000000 0.7447389
10 0.9548987 0.7447389 0.0000000
       1        2        3        4        5        6        7        8 
3.925832 5.952829 4.905805 4.647287 5.125164 5.849382 5.904335 5.652499 
       9       10 
4.042500 6.146342