Similarity Metrics Evaluation

In the mosbi package, similarities between biclusters are computed using different possible similarity metrics.

This vignette gives an overview of the implemented metrics.

library(mosbi)

The following similarity metrics are currently implemented:

# Bray-Curtis similarity
#
# s1, s2:  sizes of the two biclusters being compared.
# overlap: size of the overlap between the two biclusters.
# Returns twice the overlap divided by the summed sizes; all arithmetic is
# element-wise, so equally long vectors can be passed for every argument.
bray_curtis <- function(s1, s2, overlap) {
    combined_size <- s1 + s2
    doubled_overlap <- 2 * overlap
    doubled_overlap / combined_size
}

# Jaccard index
#
# s1, s2:  sizes of the two biclusters being compared.
# overlap: size of the overlap between the two biclusters.
# Returns the overlap divided by the size of the union (s1 + s2 - overlap);
# all arithmetic is element-wise.
jaccard <- function(s1, s2, overlap) {
    union_size <- s1 + s2 - overlap
    overlap / union_size
}

# Overlap coefficient
#
# s1, s2:  sizes of the two biclusters being compared.
# overlap: size of the overlap between the two biclusters.
# Returns the overlap divided by the size of the smaller bicluster.
overlap <- function(s1, s2, overlap) {
    # pmin() takes the minimum per element. min() would collapse both size
    # vectors into a single global minimum and give wrong results as soon as
    # the smaller bicluster is not the same for every element.
    overlap / pmin(s1, s2)
}

# Fowlkes–Mallows index
#
# s1, s2:  sizes of the two biclusters being compared.
# overlap: size of the overlap between the two biclusters.
# Works on pair counts, choose(n, 2): pairs inside the overlap and pairs
# inside the part of each bicluster that lies outside the overlap. The result
# is the geometric mean of the two "shared pairs / (shared + exclusive pairs)"
# ratios. A 0 / 0 ratio (no pairs at all on one side) yields NaN.
folkes_mallows <- function(s1, s2, overlap) {
    shared_pairs <- choose(overlap, 2)
    pairs_only_s1 <- choose(s1 - overlap, 2)
    pairs_only_s2 <- choose(s2 - overlap, 2)

    ratio_s1 <- shared_pairs / (shared_pairs + pairs_only_s1)
    ratio_s2 <- shared_pairs / (shared_pairs + pairs_only_s2)
    sqrt(ratio_s1 * ratio_s2)
}

The behavior of the similarity metrics will be evaluated for two scenarios:

  • Two biclusters of the same size with an increasing overlap.

  • Two biclusters of different sizes (one twice as big as the other) with an increasing overlap.

# Scenario 1 - two biclusters of the same size (1000 each);
# the overlap runs through every value from 1 to 1000.
size1_1 <- rep(1000, times = 1000)
size2_1 <- rep(1000, times = 1000)
overlap_1 <- seq_len(1000)

# Scenario 2 - biclusters of different sizes: the first has size 1000, the
# second size 500; the overlap runs through every value from 1 to 500.
size1_2 <- rep(1000, times = 500)
size2_2 <- rep(500, times = 500)
overlap_2 <- seq_len(500)

Two biclusters of the same size:

# Same-size scenario: draw every metric against the overlap on a shared
# similarity axis fixed to [0, 1].
plot(overlap_1, bray_curtis(size1_1, size2_1, overlap_1),
    col = "red", type = "l", xlab = "Overlap", ylab = "Similarity",
    ylim = c(0, 1)
)
lines(overlap_1, jaccard(size1_1, size2_1, overlap_1), col = "blue")
# For equal sizes the overlap coefficient coincides with Bray-Curtis, so it is
# drawn dashed to keep the red line visible underneath.
lines(overlap_1, overlap(size1_1, size2_1, overlap_1), col = "green", lty = 2)
lines(overlap_1, folkes_mallows(size1_1, size2_1, overlap_1), col = "orange")
legend(
    x = .8, legend = c("Bray-Curtis", "Jaccard", "Overlap", "Fowlkes–Mallows"),
    col = c("red", "blue", "green", "orange"),
    # One line type per entry so the legend matches the dashed overlap line.
    lty = c(1, 1, 2, 1), cex = 0.8, title = "Similarity metrics"
)

Two biclusters of different sizes:

# Different-size scenario: draw every metric against the overlap on a shared
# similarity axis fixed to [0, 1].
plot(
    x = overlap_2,
    y = bray_curtis(size1_2, size2_2, overlap_2),
    type = "l",
    col = "red",
    xlab = "Overlap",
    ylab = "Similarity",
    ylim = c(0, 1)
)
lines(x = overlap_2, y = jaccard(size1_2, size2_2, overlap_2), col = "blue")
lines(x = overlap_2, y = overlap(size1_2, size2_2, overlap_2), col = "green")
lines(
    x = overlap_2,
    y = folkes_mallows(size1_2, size2_2, overlap_2),
    col = "orange"
)
legend(
    x = 0.8,
    legend = c("Bray-Curtis", "Jaccard", "Overlap", "Fowlkes–Mallows"),
    col = c("red", "blue", "green", "orange"),
    lty = 1,
    cex = 0.8,
    title = "Similarity metrics"
)

Session Info

# Report the R version, platform and loaded packages for reproducibility.
sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] mosbi_1.11.0     BiocStyle_2.33.1
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyr_1.3.1             generics_0.1.3          sass_0.4.9             
#>  [4] utf8_1.2.4              class_7.3-22            lattice_0.22-6         
#>  [7] digest_0.6.36           magrittr_2.0.3          evaluate_0.24.0        
#> [10] grid_4.4.1              RColorBrewer_1.1-3      fastmap_1.2.0          
#> [13] jsonlite_1.8.8          BiocManager_1.30.23     purrr_1.0.2            
#> [16] fansi_1.0.6             scales_1.3.0            modeltools_0.2-23      
#> [19] jquerylib_0.1.4         cli_3.6.3               isa2_0.3.6             
#> [22] rlang_1.1.4             Biobase_2.65.0          munsell_0.5.1          
#> [25] cachem_1.1.0            yaml_2.3.10             tools_4.4.1            
#> [28] parallel_4.4.1          biclust_2.0.3.1         dplyr_1.1.4            
#> [31] colorspace_2.1-1        ggplot2_3.5.1           BiocGenerics_0.51.0    
#> [34] buildtools_1.0.0        vctrs_0.6.5             R6_2.5.1               
#> [37] stats4_4.4.1            lifecycle_1.0.4         QUBIC_1.33.0           
#> [40] MASS_7.3-61             pkgconfig_2.0.3         RcppParallel_5.1.8     
#> [43] bslib_0.8.0             pillar_1.9.0            gtable_0.3.5           
#> [46] glue_1.7.0              Rcpp_1.0.13             tidyselect_1.2.1       
#> [49] xfun_0.46               tibble_3.2.1            highr_0.11             
#> [52] sys_3.4.2               flexclust_1.4-2         knitr_1.48             
#> [55] fabia_2.51.0            igraph_2.0.3            htmltools_0.5.8.1      
#> [58] rmarkdown_2.27          BH_1.84.0-0             maketools_1.3.0        
#> [61] compiler_4.4.1          additivityTests_1.1-4.2