Tranferable Omics Pediction

suppressPackageStartupMessages({
  library(tidyverse)
  library(survival)
  library(dplyr)
  library(survminer)
  library(Biobase)
  library(ggsci)
  library(ggbeeswarm)
  library(TOP)
  library(curatedOvarianData)
})

theme_set(theme_bw())

Installation

# Install the package from Bioconductor
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("TOP")

Overview

The TOP R package provides a transfer learning approach for building predictive models across multiple omics datasets. With the increasing availability of omics data, there is a growing need for methods that can effectively integrate and analyze data from multiple sources. However, merging datasets can be challenging due to batch effects and other sources of variation.

TOP uses transfer learning strategies to build predictive models that can be applied across multiple datasets without the need for extensive batch correction methods. Specifically, TOP employs a lasso regression model to identify important features and construct a transferable predictive model. By leveraging information from multiple datasets, TOP can improve predictive accuracy and identify common biomarkers across different omics datasets.

The package provides several functions for building and evaluating transfer learning models, including options for visualizing results. Additionally, the package includes sample datasets and detailed documentation to help users get started with the package and understand the underlying methods.

Overall, TOP offers a flexible and powerful approach for integrating and analyzing omics data from multiple sources, making it a valuable tool for researchers in a variety of fields.

Loading example data

The example data used in this vignette is the curatedOvarianData. Described in the paper from Benjamin Frederick Ganzfried et al (2013)

data("GSE12418_eset", package = "curatedOvarianData")
data("GSE30009_eset", package = "curatedOvarianData")

Building a survival model.

The transferable omics prediction framework is also able to build survival models. In this section, we will demonstrate how to use TOP to build a survival model using example ovarian cancer datasets. First we utilise nine datasets that were preparred by Ganzfield et al. 

data("GSE32062.GPL6480_eset")
japan_a <- GSE32062.GPL6480_eset

data("GSE9891_eset")
tothill <- GSE9891_eset

data("GSE32063_eset")
japan_b <- GSE32063_eset

data("TCGA.RNASeqV2_eset")
selection <- TCGA.RNASeqV2_eset$tumorstage %in% c(3, 4) & TCGA.RNASeqV2_eset$site_of_tumor_first_recurrence == "metastasis"
selection[is.na(selection)] <- FALSE
tcgarnaseq <- TCGA.RNASeqV2_eset[, selection]

data("E.MTAB.386_eset")
bentink <- E.MTAB.386_eset

data("GSE13876_eset")
crijns <- GSE13876_eset
crijns <- crijns[, crijns$grade %in% c(3, 4)]

data("GSE18520_eset")
mok <- GSE18520_eset

data("GSE17260_eset")
yoshihara2010 <- GSE17260_eset

data("GSE26712_eset")
bonome <- GSE26712_eset

list_ovarian_eset <- lst(
  japan_a, tothill, japan_b,
  tcgarnaseq, bonome, mok, yoshihara2010,
  bentink, crijns
)

list_ovarian_eset %>%
  sapply(dim)
#>          japan_a tothill japan_b tcgarnaseq bonome   mok yoshihara2010 bentink
#> Features   20106   19816   20106      20502  13104 19816         20106   10357
#> Samples      260     285      40         51    195    63           110     129
#>          crijns
#> Features  20577
#> Samples      85

Common genes between datasets

In order to apply the TOP framework, it is important that the input matrices have identical feature names, such as gene names, across all datasets. In this example, we will identify the common genes present in all the datasets, as these will be the features used for transfer learning.

raw_gene_list <- purrr::map(list_ovarian_eset, rownames)
common_genes <- Reduce(f = intersect, x = raw_gene_list)
length(common_genes)
#> [1] 7517

Survival samples

Next, we will prepare the survival data from each of the ovarian cancer datasets. The survival data includes the survival time and the event status (i.e., whether the event of interest, such as death, has occurred)

ov_pdata <- purrr::map(list_ovarian_eset, pData)
list_pdata <- list_ovarian_eset %>%
  purrr::map(pData) %>%
  purrr::map(tibble::rownames_to_column, var = "sample_id")

ov_surv_raw <- purrr::map(
  .x = list_pdata,
  .f = ~ data.frame(
    sample_id = .x$sample_id,
    time = .x$days_to_death %>% as.integer(),
    dead = ifelse(.x$vital_status == "deceased", 1, 0)
  ) %>%
    na.omit() %>%
    dplyr::filter(
      time > 0,
      !is.nan(time),
      !is.nan(dead)
    )
)
ov_surv_raw %>% sapply(nrow)
#>       japan_a       tothill       japan_b    tcgarnaseq        bonome 
#>           260           276            40            51           185 
#>           mok yoshihara2010       bentink        crijns 
#>            53           110           129            85
ov_surv_y <- ov_surv_raw %>%
  purrr::map(~ .x %>%
    dplyr::select(-sample_id)) %>%
  purrr::map(~ Surv(time = .x$time, event = .x$dead))

Preparing data for modelling.

In this section, we will prepare the gene expression data and survival data for visualization. We will subset the gene expression data to include only the common genes and samples with survival information. Then, we will create a combined survival data table, with a data_source column to identify the origin of each sample. Finally we plot the distribution of survival times across datasets.

ov_surv_exprs <- purrr::map2(
  .x = list_ovarian_eset,
  .y = ov_surv_raw,
  .f = ~ exprs(.x[common_genes, .y$sample_id])
)

ov_surv_tbl <- ov_surv_raw %>%
  bind_rows(.id = "data_source")
ov_surv_tbl %>%
  ggplot(aes(
    x = time,
    y = ..density..,
    fill = data_source
  )) +
  geom_density(alpha = 0.25) +
  scale_fill_d3()
#> Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
#> ℹ Please use `after_stat(density)` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.

# Building a TOP survival model. In this section, we will build the survival model using the TOP framework. To do this, we will first organize the survival data and gene expression data into appropriate data structures. Then, we will apply the TOP_survival function to build the model, specifying the number of features to be selected by the lasso regression.

surv_list <- ov_surv_tbl %>%
  split(ov_surv_tbl$data_source)
surv_list <- lapply(surv_list, function(x) x[, 3:4])

surv_counts <- ov_surv_exprs %>% lapply(t)
surv_list <- surv_list[names(surv_counts)]
surv_model <- TOP_survival(
  x_list = surv_counts, y_list = surv_list, nFeatures = 10
)

Visualising performance.

Once we have built the survival model using the TOP framework, it is important to evaluate its performance. In this section, we will visualize the performance of the model by calculating the concordance index (C-index) for each dataset. The C-index measures the agreement between the predicted and observed survival times, with values ranging from 0 to 1,

conf_results <- unlist(lapply(seq_along(surv_counts), function(x) {
  Surv_TOP_CI(
    surv_model,
    newx = surv_counts[[x]], newy = surv_list[[x]]
  )$concordance
}))

conf_results %>%
  tibble::enframe() %>%
  mutate(Metric = "C-index") %>%
  ggplot(aes(y = value, x = Metric)) +
  geom_boxplot(width = 0.5) +
  ylab("C-index") +
  geom_jitter(alpha = 0.7, width = 0.1) +
  theme(axis.text.x = element_blank()) +
  xlab("Survival Model")

sessionInfo

sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] curatedOvarianData_1.43.0 TOP_1.7.0                
#>  [3] ggbeeswarm_0.7.2          ggsci_3.2.0              
#>  [5] Biobase_2.67.0            BiocGenerics_0.53.1      
#>  [7] generics_0.1.3            survminer_0.5.0          
#>  [9] ggpubr_0.6.0              survival_3.7-0           
#> [11] lubridate_1.9.3           forcats_1.0.0            
#> [13] stringr_1.5.1             dplyr_1.1.4              
#> [15] purrr_1.0.2               readr_2.1.5              
#> [17] tidyr_1.3.1               tibble_3.2.1             
#> [19] ggplot2_3.5.1             tidyverse_2.0.0          
#> [21] BiocStyle_2.35.0         
#> 
#> loaded via a namespace (and not attached):
#>   [1] splines_4.4.1               latex2exp_0.9.6            
#>   [3] polyclip_1.10-7             hardhat_1.4.0              
#>   [5] pROC_1.18.5                 rpart_4.1.23               
#>   [7] lifecycle_1.0.4             rstatix_0.7.2              
#>   [9] doParallel_1.0.17           globals_0.16.3             
#>  [11] lattice_0.22-6              MASS_7.3-61                
#>  [13] MultiAssayExperiment_1.33.0 backports_1.5.0            
#>  [15] magrittr_2.0.3              plotly_4.10.4              
#>  [17] limma_3.63.0                Hmisc_5.2-0                
#>  [19] sass_0.4.9                  rmarkdown_2.28             
#>  [21] jquerylib_0.1.4             yaml_2.3.10                
#>  [23] ClassifyR_3.11.0            buildtools_1.0.0           
#>  [25] abind_1.4-8                 zlibbioc_1.52.0            
#>  [27] GenomicRanges_1.59.0        ggraph_2.2.1               
#>  [29] nnet_7.3-19                 tweenr_2.0.3               
#>  [31] ipred_0.9-15                lava_1.8.0                 
#>  [33] GenomeInfoDbData_1.2.13     IRanges_2.41.0             
#>  [35] KMsurv_0.1-5                S4Vectors_0.44.0           
#>  [37] ggrepel_0.9.6               listenv_0.9.1              
#>  [39] maketools_1.3.1             parallelly_1.38.0          
#>  [41] codetools_0.2-20            DelayedArray_0.33.1        
#>  [43] ggforce_0.4.2               shape_1.4.6.1              
#>  [45] tidyselect_1.2.1            UCSC.utils_1.2.0           
#>  [47] farver_2.1.2                viridis_0.6.5              
#>  [49] matrixStats_1.4.1           stats4_4.4.1               
#>  [51] base64enc_0.1-3             jsonlite_1.8.9             
#>  [53] caret_6.0-94                tidygraph_1.3.1            
#>  [55] Formula_1.2-5               iterators_1.0.14           
#>  [57] foreach_1.5.2               tools_4.4.1                
#>  [59] ggnewscale_0.5.0            Rcpp_1.0.13                
#>  [61] glue_1.8.0                  prodlim_2024.06.25         
#>  [63] gridExtra_2.3               SparseArray_1.6.0          
#>  [65] xfun_0.48                   MatrixGenerics_1.19.0      
#>  [67] ggthemes_5.1.0              GenomeInfoDb_1.43.0        
#>  [69] withr_3.0.2                 BiocManager_1.30.25        
#>  [71] fastmap_1.2.0               fansi_1.0.6                
#>  [73] digest_0.6.37               timechange_0.3.0           
#>  [75] R6_2.5.1                    colorspace_2.1-1           
#>  [77] utf8_1.2.4                  directPA_1.5.1             
#>  [79] calibrate_1.7.7             data.table_1.16.2          
#>  [81] recipes_1.1.0               class_7.3-22               
#>  [83] graphlayouts_1.2.0          httr_1.4.7                 
#>  [85] htmlwidgets_1.6.4           S4Arrays_1.6.0             
#>  [87] ModelMetrics_1.2.2.2        pkgconfig_2.0.3            
#>  [89] gtable_0.3.6                timeDate_4041.110          
#>  [91] XVector_0.46.0              sys_3.4.3                  
#>  [93] survMisc_0.5.6              htmltools_0.5.8.1          
#>  [95] carData_3.0-5               scales_1.3.0               
#>  [97] ggupset_0.4.0               gower_1.0.1                
#>  [99] knitr_1.48                  km.ci_0.5-6                
#> [101] rstudioapi_0.17.1           tzdb_0.4.0                 
#> [103] reshape2_1.4.4              checkmate_2.3.2            
#> [105] nlme_3.1-166                cachem_1.1.0               
#> [107] zoo_1.8-12                  parallel_4.4.1             
#> [109] vipor_0.4.7                 foreign_0.8-87             
#> [111] pillar_1.9.0                grid_4.4.1                 
#> [113] vctrs_0.6.5                 car_3.1-3                  
#> [115] xtable_1.8-4                cluster_2.1.6              
#> [117] beeswarm_0.4.0              htmlTable_2.4.3            
#> [119] evaluate_1.0.1              cli_3.6.3                  
#> [121] compiler_4.4.1              rlang_1.1.4                
#> [123] crayon_1.5.3                future.apply_1.11.3        
#> [125] ggsignif_0.6.4              labeling_0.4.3             
#> [127] plyr_1.8.9                  stringi_1.8.4              
#> [129] viridisLite_0.4.2           BiocParallel_1.41.0        
#> [131] assertthat_0.2.1            munsell_0.5.1              
#> [133] lazyeval_0.2.2              glmnet_4.1-8               
#> [135] Matrix_1.7-1                hms_1.1.3                  
#> [137] future_1.34.0               statmod_1.5.0              
#> [139] highr_0.11                  SummarizedExperiment_1.36.0
#> [141] igraph_2.1.1                broom_1.0.7                
#> [143] memoise_2.0.1               bslib_0.8.0