Predicting New MCIA scores

Predicting MCIA global (factor) scores for new test samples

It may be of interest to use the embedding that is calculated on a training sample set to predict scores on a test set (or, equivalently, on new data).

After loading the nipalsMCIA library, we randomly split the NCI60 cancer cell line data into training and test sets.

Installation

# devel version (installed from GitHub; needs the devtools package)
# install.packages("devtools")
devtools::install_github("Muunraker/nipalsMCIA", ref = "devel",
                         force = TRUE, build_vignettes = TRUE)

# release version (installed through BiocManager)
# requireNamespace() only checks that BiocManager is available; unlike
# require(), it does not attach the package to the search path.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install("nipalsMCIA")
library(ggplot2)
library(MultiAssayExperiment)
library(nipalsMCIA)

Split the data

# Load the NCI60 data and fix the RNG seed so the random split is reproducible
data(NCI60)
set.seed(8)

# Samples are the rows of each block; draw 70% of the row indices for training
num_samples <- nrow(data_blocks[[1]])
num_train <- round(0.7 * num_samples)
train_samples <- sample.int(num_samples, num_train)

# Start from copies of the full list, then overwrite every block in place:
# training keeps the sampled rows, test keeps all remaining rows
data_blocks_train <- data_blocks
data_blocks_train[] <- lapply(
  data_blocks,
  function(block) block[train_samples, ]
)

data_blocks_test <- data_blocks
data_blocks_test[] <- lapply(
  data_blocks,
  function(block) block[-train_samples, ]
)

# Split the metadata with the same row indices used for the data blocks.
# Row names are taken from the matching mrna block and the column is
# labelled "cancerType".
metadata_train <- setNames(
  data.frame(
    metadata_NCI60[train_samples, ],
    row.names = rownames(data_blocks_train[["mrna"]])
  ),
  "cancerType"
)

metadata_test <- setNames(
  data.frame(
    metadata_NCI60[-train_samples, ],
    row.names = rownames(data_blocks_test[["mrna"]])
  ),
  "cancerType"
)

# Wrap each split (data blocks plus its metadata) into an MAE object;
# row_format = "sample" because samples are stored in the rows of each block
data_blocks_train_mae <- simple_mae(
  data_blocks_train,
  colData = metadata_train,
  row_format = "sample"
)
data_blocks_test_mae <- simple_mae(
  data_blocks_test,
  colData = metadata_test,
  row_format = "sample"
)

Run nipalsMCIA on training data

# Fit the MCIA model on the training MAE only (num_PCs = 10, plotting disabled)
MCIA_train <- nipals_multiblock(
  data_blocks = data_blocks_train_mae,
  num_PCs = 10,
  col_preproc_method = "colprofile",
  tol = 1e-9,
  plots = "none"
)

Visualize model on training data using metadata on cancer type

The get_metadata_colors() function returns a color assignment for the values of a chosen metadata column (selected here with color_col = 1). The nmb_get_gs() function returns the global scores from the input NipalsResult object.

# Colors for the first metadata column of the training model
meta_colors <- get_metadata_colors(
  mcia_results = MCIA_train,
  color_col = 1,
  color_pal_params = list(option = "E")
)

# First two global score columns of the training model, plus cancer type
global_scores <- nmb_get_gs(MCIA_train)
MCIA_out <- data.frame(global_scores[, seq_len(2)])
MCIA_out[["cancerType"]] <- nmb_get_metadata(MCIA_train)[["cancerType"]]
names(MCIA_out) <- c("Factor.1", "Factor.2", "cancerType")

# Scatter plot of the first two factors for the training samples,
# colored by cancer type
ggplot(MCIA_out, aes(Factor.1, Factor.2, color = cancerType)) +
  geom_point(size = 3) +
  scale_color_manual(values = meta_colors) +
  labs(title = "MCIA for NCI60 training data") +
  theme_bw()

Generate factor scores for test data using the MCIA_train model

We use the predict_gs() function to generate new factor scores for the test data set from the MCIA_train model. The new dataset, in the form of an MAE object, is passed in through the test_data parameter.

# Score the held-out test MAE using the model fitted on the training data
MCIA_test_scores <- predict_gs(
  test_data = data_blocks_test_mae,
  mcia_results = MCIA_train
)

Visualize new scores with old

We once again plot the top two factor scores, this time for both the training and test datasets.

# First two predicted score columns for the test samples, plus cancer type
# taken from the colData of the test MAE
MCIA_out_test <- data.frame(MCIA_test_scores[, seq_len(2)])
MCIA_out_test[["cancerType"]] <-
  MultiAssayExperiment::colData(data_blocks_test_mae)[["cancerType"]]
names(MCIA_out_test) <- c("Factor.1", "Factor.2", "cancerType")

# Tag each frame with its origin so the two sets can be told apart when plotted
MCIA_out[["set"]] <- "train"
MCIA_out_test[["set"]] <- "test"

# Stack training and test rows and reset the row names to the default sequence
MCIA_out_full <- rbind(MCIA_out, MCIA_out_test)
row.names(MCIA_out_full) <- NULL

# Scatter plot of the first two factors for all samples: color encodes
# cancer type, point shape encodes training vs. test membership
ggplot(
  MCIA_out_full,
  aes(Factor.1, Factor.2, color = cancerType, shape = set)
) +
  geom_point(size = 3) +
  scale_color_manual(values = meta_colors) +
  labs(title = "MCIA for NCI60 training and test data") +
  theme_bw()

Session Info

Session Info
# Print the R version, platform and package versions used to build this document
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MultiAssayExperiment_1.31.5 SummarizedExperiment_1.35.3
##  [3] Biobase_2.65.1              GenomicRanges_1.57.1       
##  [5] GenomeInfoDb_1.41.2         IRanges_2.39.2             
##  [7] S4Vectors_0.43.2            BiocGenerics_0.51.3        
##  [9] MatrixGenerics_1.17.0       matrixStats_1.4.1          
## [11] stringr_1.5.1               nipalsMCIA_1.3.1           
## [13] ggpubr_0.6.0                ggplot2_3.5.1              
## [15] fgsea_1.31.4                dplyr_1.1.4                
## [17] ComplexHeatmap_2.21.1       BiocStyle_2.33.1           
## 
## loaded via a namespace (and not attached):
##  [1] rlang_1.1.4             magrittr_2.0.3          clue_0.3-65            
##  [4] GetoptLong_1.0.5        compiler_4.4.1          png_0.1-8              
##  [7] vctrs_0.6.5             pkgconfig_2.0.3         shape_1.4.6.1          
## [10] crayon_1.5.3            fastmap_1.2.0           backports_1.5.0        
## [13] XVector_0.45.0          labeling_0.4.3          utf8_1.2.4             
## [16] rmarkdown_2.28          pracma_2.4.4            UCSC.utils_1.1.0       
## [19] purrr_1.0.2             xfun_0.48               zlibbioc_1.51.1        
## [22] cachem_1.1.0            jsonlite_1.8.9          highr_0.11             
## [25] DelayedArray_0.31.14    BiocParallel_1.39.0     broom_1.0.7            
## [28] parallel_4.4.1          cluster_2.1.6           R6_2.5.1               
## [31] stringi_1.8.4           bslib_0.8.0             RColorBrewer_1.1-3     
## [34] car_3.1-3               jquerylib_0.1.4         Rcpp_1.0.13            
## [37] iterators_1.0.14        knitr_1.48              BiocBaseUtils_1.7.3    
## [40] Matrix_1.7-0            tidyselect_1.2.1        abind_1.4-8            
## [43] yaml_2.3.10             doParallel_1.0.17       codetools_0.2-20       
## [46] lattice_0.22-6          tibble_3.2.1            withr_3.0.1            
## [49] evaluate_1.0.0          circlize_0.4.16         pillar_1.9.0           
## [52] BiocManager_1.30.25     carData_3.0-5           foreach_1.5.2          
## [55] generics_0.1.3          munsell_0.5.1           scales_1.3.0           
## [58] glue_1.8.0              maketools_1.3.0         tools_4.4.1            
## [61] sys_3.4.3               data.table_1.16.0       RSpectra_0.16-2        
## [64] ggsignif_0.6.4          buildtools_1.0.0        fastmatch_1.1-4        
## [67] cowplot_1.1.3           tidyr_1.3.1             colorspace_2.1-1       
## [70] GenomeInfoDbData_1.2.13 Formula_1.2-5           cli_3.6.3              
## [73] fansi_1.0.6             viridisLite_0.4.2       S4Arrays_1.5.10        
## [76] gtable_0.3.5            rstatix_0.7.2           sass_0.4.9             
## [79] digest_0.6.37           SparseArray_1.5.44      farver_2.1.2           
## [82] rjson_0.2.23            htmltools_0.5.8.1       lifecycle_1.0.4        
## [85] httr_1.4.7              GlobalOptions_0.1.2