library(dplyr)
library(Matrix)
library(ggplot2)
library(forcats)
library(parallel)
library(reshape2)
library(survival)
library(VennDiagram)
library(futile.logger)
library(curatedTCGAData)
library(MultiAssayExperiment)
library(TCGAutils)
#
library(glmSparseNet)
#
#
# Some general options for futile.logger the debugging package
flog.layout(layout.format("[~l] ~m"))
options(
"glmSparseNet.show_message" = FALSE,
"glmSparseNet.base_dir" = withr::local_tempdir()
)
# Setting ggplot2 default theme as minimal
theme_set(ggplot2::theme_minimal())
This vignette uses the STRING database (https://string-db.org/) of protein-protein interactions as the network-based penalizer in generalized linear models using Breast invasive carcinoma sample dataset.
The degree vector is calculated manually to account for genes that are not present in the STRING database, as these will not have any interactions, i.e. edges.
Retrieve all interactions from STRING databse. We have included a helper function that retrieves the Homo sapiens known interactions.
For this vignette, we use a cached version of all interaction with
score_threshold = 700
Note: Text-based interactions are excluded from the network.
Build a sparse matrix object that contains the network.
## [INFO] Directed graph (score_threshold = 700)
## [INFO] * total edges: 225330
## [INFO] * unique protein: 11033
## [INFO] * edges per protein: 20.423276
## [INFO]
## [INFO] Undirected graph (score_threshold = 700)
## [INFO] * total edges: 225209
## [INFO] * unique protein: 11033
## [INFO] * edges per protein: 20.412309
## [INFO] Summary of degree:
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 13.00 40.83 43.00 4175.00
glmSparseNet
brca <- curatedTCGAData(
diseaseCode = "BRCA", assays = "RNASeq2GeneNorm",
version = "1.1.38", dry.run = FALSE
)
Build the survival data from the clinical columns.
xdata
and
ydata
# keep only solid tumour (code: 01)
brcaPrimarySolidTumor <- TCGAutils::TCGAsplitAssays(brca, "01")
xdataRaw <- t(assay(brcaPrimarySolidTumor[[1]]))
# Get survival information
ydataRaw <- colData(brcaPrimarySolidTumor) |>
as.data.frame() |>
# Convert days to integer
dplyr::mutate(
Days.to.date.of.Death = as.integer(Days.to.date.of.Death),
Days.to.Last.Contact = as.integer(Days.to.Date.of.Last.Contact)
) |>
# Find max time between all days (ignoring missings)
dplyr::rowwise() |>
dplyr::mutate(
time = max(days_to_last_followup, Days.to.date.of.Death,
Days.to.Last.Contact, days_to_death,
na.rm = TRUE
)
) |>
# Keep only survival variables and codes
dplyr::select(patientID, status = vital_status, time) |>
# Discard individuals with survival time less or equal to 0
dplyr::filter(!is.na(time) & time > 0) |>
as.data.frame()
# Set index as the patientID
rownames(ydataRaw) <- ydataRaw$patientID
# keep only features that are in degreeNetworkVector and have
# standard deviation > 0
validFeatures <- colnames(xdataRaw)[
colnames(xdataRaw) %in% names(degreeNetworkVector[degreeNetworkVector > 0])
]
xdataRaw <- xdataRaw[
TCGAbarcode(rownames(xdataRaw)) %in% rownames(ydataRaw), validFeatures
]
xdataRaw <- scale(xdataRaw)
# Order ydata the same as assay
ydataRaw <- ydataRaw[TCGAbarcode(rownames(xdataRaw)), ]
# Using only a subset of genes previously selected to keep this short example.
set.seed(params$seed)
smallSubset <- c(
"AAK1", "ADRB1", "AK7", "ALK", "APOBEC3F", "ARID1B", "BAMBI",
"BRAF", "BTG1", "CACNG8", "CASP12", "CD5", "CDA", "CEP72",
"CPD", "CSF2RB", "CSN3", "DCT", "DLG3", "DLL3", "DPP4",
"DSG1", "EDA2R", "ERP27", "EXD1", "GABBR2", "GADD45A",
"GBP1", "HTR1F", "IFNK", "IRF2", "IYD", "KCNJ11", "KRTAP5-6",
"MAFA", "MAGEB4", "MAP2K6", "MCTS1", "MMP15", "MMP9",
"NFKBIA", "NLRC4", "NT5C1A", "OPN4", "OR13C5", "OR13C8",
"OR2T6", "OR4K2", "OR52E6", "OR5D14", "OR5H1", "OR6C4",
"OR7A17", "OR8J3", "OSBPL1A", "PAK6", "PDE11A", "PELO",
"PGK1", "PIK3CB", "PMAIP1", "POLR2B", "POP1", "PPFIA3",
"PSME1", "PSME2", "PTEN", "PTGES3", "QARS", "RABGAP1",
"RBM3", "RFC3", "RGPD8", "RPGRIP1L", "SAV1", "SDC1", "SDC3",
"SEC16B", "SFPQ", "SFRP5", "SIPA1L1", "SLC2A14", "SLC6A9",
"SPATA5L1", "SPINT1", "STAR", "STXBP5", "SUN3", "TACC2",
"TACR1", "TAGLN2", "THPO", "TNIP1", "TP53", "TRMT2B", "TUBB1",
"VDAC1", "VSIG8", "WNT3A", "WWOX", "XRCC4", "YME1L1",
"ZBTB11", "ZSCAN21"
) |>
sample(size = 50) |>
sort()
# make sure we have 100 genes
smallSubset <- c(smallSubset, sample(colnames(xdataRaw), 51)) |>
unique() |>
sort()
xdata <- xdataRaw[, smallSubset[smallSubset %in% colnames(xdataRaw)]]
ydata <- ydataRaw |>
dplyr::select(time, status) |>
dplyr::filter(!is.na(time) | time < 0)
#
# Add degree 0 to genes not in STRING network
myDegree <- degreeNetworkVector[smallSubset]
myString <- stringNetworkBinary[smallSubset, smallSubset]
Degree distribution for sample set of gene features (in xdata).
Penalizes using the Hub heuristics, see hubHeuristic
function definition for more details.
resultCVHub <- cv.glmHub(xdata,
Surv(ydata$time, ydata$status),
family = "cox",
foldid = foldid,
network = myString,
network.options = networkOptions(minDegree = 0.2)
)
Kaplan-Meier estimator separating individuals by low and high risk (based on model’s coefficients)
## Warning: The `plot.title` argument of `separate2GroupsCox()` is deprecated as of
## glmSparseNet 1.21.0.
## ℹ Please use the `plotTitle` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `legend.outside` argument of `separate2GroupsCox()` is deprecated as of
## glmSparseNet 1.21.0.
## ℹ Please use the `legendOutside` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## $pvalue
## [1] 9.628346e-05
##
## $plot
##
## $km
## Call: survfit(formula = survival::Surv(time, status) ~ group, data = prognosticIndexDf)
##
## n events median 0.95LCL 0.95UCL
## Low risk - 1 983 129 3959 3738 7455
## High risk - 1 97 23 2911 2469 NA
Penalizes using the Orphan heuristics, see
orphanHeuristic
function definition for more details.
resultCVOrphan <- cv.glmOrphan(xdata,
Surv(ydata$time, ydata$status),
family = "cox",
foldid = foldid,
network = myString,
network.options = networkOptions(minDegree = 0.2)
)
Kaplan-Meier estimator separating individuals by low and high risk (based on model’s coefficients)
## $pvalue
## [1] 0
##
## $plot
##
## $km
## Call: survfit(formula = survival::Surv(time, status) ~ group, data = prognosticIndexDf)
##
## n events median 0.95LCL 0.95UCL
## Low risk - 1 540 34 7455 6456 NA
## High risk - 1 540 118 2866 2573 3472
Uses regular glmnet model as simple baseline
## Loaded glmnet 4.1-8
resultCVGlmnet <- cv.glmnet(xdata,
Surv(ydata$time, ydata$status),
family = "cox",
foldid = foldid
)
Kaplan-Meier estimator separating individuals by low and high risk (based on model’s coefficients)
## $pvalue
## [1] 0
##
## $plot
##
## $km
## Call: survfit(formula = survival::Surv(time, status) ~ group, data = prognosticIndexDf)
##
## n events median 0.95LCL 0.95UCL
## Low risk - 1 540 38 6593 4398 NA
## High risk - 1 540 114 2854 2551 3461
Venn diagram of overlapping genes.
Descriptive table showing which genes are selected in each model
We can observe, that elastic net without network-based penalization selects the best model with 40% more genes than glmOrphan and glmHub, without loosing accuracy.
note: size of circles represent the degree of that gene in network.
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] glmnet_4.1-8 VennDiagram_1.7.3
## [3] reshape2_1.4.4 forcats_1.0.0
## [5] Matrix_1.7-1 glmSparseNet_1.25.0
## [7] TCGAutils_1.27.0 curatedTCGAData_1.28.1
## [9] MultiAssayExperiment_1.33.1 SummarizedExperiment_1.37.0
## [11] Biobase_2.67.0 GenomicRanges_1.59.1
## [13] GenomeInfoDb_1.43.2 IRanges_2.41.1
## [15] S4Vectors_0.45.2 BiocGenerics_0.53.3
## [17] generics_0.1.3 MatrixGenerics_1.19.0
## [19] matrixStats_1.4.1 futile.logger_1.4.3
## [21] survival_3.7-0 ggplot2_3.5.1
## [23] dplyr_1.1.4 BiocStyle_2.35.0
##
## loaded via a namespace (and not attached):
## [1] sys_3.4.3 jsonlite_1.8.9
## [3] shape_1.4.6.1 magrittr_2.0.3
## [5] GenomicFeatures_1.59.1 farver_2.1.2
## [7] rmarkdown_2.29 BiocIO_1.17.1
## [9] zlibbioc_1.52.0 vctrs_0.6.5
## [11] memoise_2.0.1 Rsamtools_2.23.1
## [13] RCurl_1.98-1.16 rstatix_0.7.2
## [15] htmltools_0.5.8.1 S4Arrays_1.7.1
## [17] BiocBaseUtils_1.9.0 progress_1.2.3
## [19] AnnotationHub_3.15.0 lambda.r_1.2.4
## [21] curl_6.0.1 broom_1.0.7
## [23] Formula_1.2-5 pROC_1.18.5
## [25] SparseArray_1.7.2 sass_0.4.9
## [27] bslib_0.8.0 plyr_1.8.9
## [29] httr2_1.0.7 zoo_1.8-12
## [31] futile.options_1.0.1 cachem_1.1.0
## [33] buildtools_1.0.0 GenomicAlignments_1.43.0
## [35] mime_0.12 lifecycle_1.0.4
## [37] iterators_1.0.14 pkgconfig_2.0.3
## [39] R6_2.5.1 fastmap_1.2.0
## [41] GenomeInfoDbData_1.2.13 digest_0.6.37
## [43] colorspace_2.1-1 AnnotationDbi_1.69.0
## [45] ExperimentHub_2.15.0 RSQLite_2.3.8
## [47] ggpubr_0.6.0 filelock_1.0.3
## [49] labeling_0.4.3 km.ci_0.5-6
## [51] fansi_1.0.6 httr_1.4.7
## [53] abind_1.4-8 compiler_4.4.2
## [55] bit64_4.5.2 withr_3.0.2
## [57] backports_1.5.0 BiocParallel_1.41.0
## [59] carData_3.0-5 DBI_1.2.3
## [61] ggsignif_0.6.4 biomaRt_2.63.0
## [63] rappdirs_0.3.3 DelayedArray_0.33.2
## [65] rjson_0.2.23 tools_4.4.2
## [67] glue_1.8.0 restfulr_0.0.15
## [69] checkmate_2.3.2 gtable_0.3.6
## [71] KMsurv_0.1-5 tzdb_0.4.0
## [73] tidyr_1.3.1 survminer_0.5.0
## [75] data.table_1.16.2 hms_1.1.3
## [77] car_3.1-3 xml2_1.3.6
## [79] utf8_1.2.4 XVector_0.47.0
## [81] BiocVersion_3.21.1 foreach_1.5.2
## [83] pillar_1.9.0 stringr_1.5.1
## [85] splines_4.4.2 BiocFileCache_2.15.0
## [87] lattice_0.22-6 rtracklayer_1.67.0
## [89] bit_4.5.0 tidyselect_1.2.1
## [91] maketools_1.3.1 Biostrings_2.75.1
## [93] knitr_1.49 gridExtra_2.3
## [95] xfun_0.49 stringi_1.8.4
## [97] UCSC.utils_1.3.0 yaml_2.3.10
## [99] evaluate_1.0.1 codetools_0.2-20
## [101] tibble_3.2.1 BiocManager_1.30.25
## [103] cli_3.6.3 xtable_1.8-4
## [105] munsell_0.5.1 jquerylib_0.1.4
## [107] survMisc_0.5.6 Rcpp_1.0.13-1
## [109] GenomicDataCommons_1.31.0 dbplyr_2.5.0
## [111] png_0.1-8 XML_3.99-0.17
## [113] readr_2.1.5 blob_1.2.4
## [115] prettyunits_1.2.0 bitops_1.0-9
## [117] scales_1.3.0 purrr_1.0.2
## [119] crayon_1.5.3 rlang_1.1.4
## [121] KEGGREST_1.47.0 rvest_1.0.4
## [123] formatR_1.14