This tutorial demonstrates how to use GeomxTools for preprocessing protein or proteogenomics data.
Data processing is very similar to what is shown in the Developer_Introduction_to_the_NanoStringGeoMxSet and GeoMx Workflow vignettes, with a couple of protein-specific functions.
GeoMxSet objects can only read in one analyte at a time. With protein or proteogenomics data, the desired analyte must be added to the call to read in the object. RNA is the default analyte.
# Locate the example proteogenomics data shipped with the GeomxTools package.
datadir <- system.file("extdata", "DSP_Proteogenomics_Example_Data",
                       package = "GeomxTools")

# Extract the archives into the session's temporary directory so the tutorial
# does not leave files behind in the working directory. unzip() returns the
# paths of the extracted files. file.path() inserts the separator itself, so
# the file names carry no leading slash.
DCCFiles <- unzip(zipfile = file.path(datadir, "DCCs.zip"), exdir = tempdir())
PKCFiles <- unzip(zipfile = file.path(datadir, "pkcs.zip"), exdir = tempdir())

# Sample annotation workbook (read from its "Annotations" sheet below).
SampleAnnotationFile <- file.path(datadir, "Annotation.xlsx")
# Read the RNA analyte from the DCC/PKC files and the annotation workbook.
# Any warnings raised by the reader are suppressed for the whole call.
RNAData <- suppressWarnings(
  readNanoStringGeoMxSet(
    dccFiles = DCCFiles,
    pkcFiles = PKCFiles,
    phenoDataFile = SampleAnnotationFile,
    phenoDataSheet = "Annotations",
    phenoDataDccColName = "Sample_ID",
    protocolDataColNames = c("Tissue", "Segment_Type", "ROI.Size"),
    configFile = NULL,
    analyte = "RNA",
    phenoDataColPrefix = "",
    experimentDataColNames = NULL
  )
)
# Read the same files again, this time requesting the protein analyte.
# Any warnings raised by the reader are suppressed for the whole call.
proteinData <- suppressWarnings(
  readNanoStringGeoMxSet(
    dccFiles = DCCFiles,
    pkcFiles = PKCFiles,
    phenoDataFile = SampleAnnotationFile,
    phenoDataSheet = "Annotations",
    phenoDataDccColName = "Sample_ID",
    protocolDataColNames = c("Tissue", "Segment_Type", "ROI.Size"),
    configFile = NULL,
    analyte = "protein",
    phenoDataColPrefix = "",
    experimentDataColNames = NULL
  )
)
# Aggregate the RNA counts, then print the object summary
# (the summary reports `feature: Target`).
RNAData <- aggregateCounts(RNAData)
RNAData

# Print the protein object as read; no aggregateCounts() call is made for it
# and its summary already reports `feature: Target`.
proteinData
## NanoStringGeoMxSet (storageMode: lockedEnvironment)
## assayData: 18677 features, 84 samples
## element names: exprs
## protocolData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: FileVersion SoftwareVersion ... ROI.Size (17 total)
## varMetadata: labelDescription
## phenoData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: Plate Well ... NegGeoSD_Hs_R_NGS_WTA_v1.0 (13 total)
## varMetadata: labelDescription
## featureData
## featureNames: A2M NAT2 ... CST2 (18677 total)
## fvarLabels: TargetName Module ... Negative (6 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: Hs_R_NGS_WTA_v1.0.pkc
## signature: none
## feature: Target
## analyte: RNA
## NanoStringGeoMxSet (storageMode: lockedEnvironment)
## assayData: 147 features, 84 samples
## element names: exprs
## protocolData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: FileVersion SoftwareVersion ... ROI.Size (17 total)
## varMetadata: labelDescription
## phenoData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: Plate Well ... Y (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: Ms IgG1 CD45 ... ADAM10 (147 total)
## fvarLabels: RTS_ID TargetName ... Negative (8 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: Hs_P_NGS_ADPath_Ext_v1.0.pkc Hs_P_NGS_ADPath_v1.0.pkc Hs_P_NGS_Autophagy_v1.0.pkc Hs_P_NGS_CellDeath_v1.0.pkc Hs_P_NGS_Core_v1.0.pkc Hs_P_NGS_GlialSubtype_v1.0.pkc Hs_P_NGS_IODrugTarget_v1.0.pkc Hs_P_NGS_ImmuneActivation_v1.0.pkc Hs_P_NGS_ImmuneCellTyping_v1.0.pkc Hs_P_NGS_MAPK_v1.1.pkc Hs_P_NGS_Myeloid_v1.0.pkc Hs_P_NGS_NeuralCellTyping_v1.0.pkc Hs_P_NGS_PDPath_v1.0.pkc Hs_P_NGS_PI3K_AKT_v1.0.pkc Hs_P_NGS_PanTumor_v1.0.pkc
## signature: none
## feature: Target
## analyte: Protein
By having the datasets split by analyte, each object can go through the typical QC and normalization steps specific to that analyte.
For RNA please refer to the introduction or GeoMx Workflow vignettes.
After reading in the object, we will do one QC step: flagging and removing low-quality ROIs.
# Flag segments against the QC cutoffs; the flags are read back below from the
# "QCFlags" column of protocolData.
# NOTE(review): minNuclei = 16000 and minArea = 20 look swapped relative to
# their names -- confirm these are the intended cutoffs.
proteinData <- setSegmentQCFlags(
  proteinData,
  qcCutoffs = list(
    percentSaturation = 45,
    minSegmentReads = 1000,
    percentAligned = 80,
    minNegativeCount = 10,
    maxNTCCount = 60,
    minNuclei = 16000,
    minArea = 20
  )
)

# low sequenced ROIs
qcFlags <- as.data.frame(protocolData(proteinData)[["QCFlags"]])
isLowSaturation <- qcFlags[["LowSaturation"]] %in% TRUE
lowSaturation <- which(isLowSaturation)

# remove low quality ROIs
# Subset with a logical mask rather than `-lowSaturation`: when no ROI is
# flagged, `-integer(0)` would select zero samples instead of keeping them all.
passedQC <- proteinData[, !isLowSaturation]

# Compare dimensions before and after removing the flagged ROIs.
dim(proteinData)
dim(passedQC)
## Features Samples
## 147 84
## Features Samples
## 147 82
Housekeepers and negative controls (IgGs) can easily be pulled out of the dataset.
## [1] "Histone H3" "GAPDH" "S6"
## [1] "Ms IgG1" "Ms IgG2a" "Rb IgG"
For the target QC step, we identify proteins with potentially little useful signal using this figure.
# Names of the IgG negative-control targets; the QC helpers below take them
# through `neg.names`.
igg.names <- iggNames(proteinData)

# qcProteinSignal() returns a function that draws the figure when called;
# qcProteinSignalNames() returns the protein names in plotting order.
fig <- qcProteinSignal(object = proteinData, neg.names = igg.names)
proteinOrder <- qcProteinSignalNames(object = proteinData, neg.names = igg.names)

# match() yields exactly one position per requested protein (NA if absent), so
# each rectangle below stays paired with its protein. Concatenating which()
# results would shift the later positions whenever one protein is missing.
genesOfInterest <- match(c("Tyrosine Hydroxylase", "ApoA-I", "EpCAM"),
                         proteinOrder)

fig()
# Box around the first positions on the x-axis (x = 0 to 4).
rect(xleft = 0, xright = 4,
     ybottom = -2, ytop = 2, density = 0, col = "#1B9E77", lwd = 2)
# One box per protein of interest, centred on its position in the figure.
rect(xleft = genesOfInterest[1] - 1, xright = genesOfInterest[1] + 1,
     ybottom = -2, ytop = 1.25, density = 0, col = "#D95F02", lwd = 2)
rect(xleft = genesOfInterest[2] - 1, xright = genesOfInterest[2] + 1,
     ybottom = -1, ytop = 3, density = 0, col = "#66A61E", lwd = 2)
rect(xleft = genesOfInterest[3] - 1, xright = genesOfInterest[3] + 1,
     ybottom = -3, ytop = 6.5, density = 0, col = "#E7298A", lwd = 2)
The highlighted proteins may require further investigation after differential expression analysis but can typically be kept in the study.
# Recompute the plotting order and find where P62 sits in it.
proteinOrder <- qcProteinSignalNames(
  object = proteinData,
  neg.names = igg.names
)
P62 <- which(proteinOrder == "P62")

# Redraw the figure and hatch the region from x = 3.5 up to P62.
fig()
rect(
  xleft = 3.5, ybottom = -6,
  xright = P62, ytop = 10,
  density = 2, col = "red", lty = 3
)
However, here is example code if you choose to remove them.
In bulk:
# Protein names in plotting order, and how many there are before filtering.
proteinOrder <- qcProteinSignalNames(object = proteinData, neg.names = igg.names)
length(proteinOrder)

P62 <- which(proteinOrder == "P62")
hasP62 <- length(P62) == 1

fig()
# Hatch the region that is about to be removed (skipped if P62 is not found).
if (hasP62) {
  rect(xleft = 3.5, xright = P62, ybottom = -6, ytop = 10,
       density = 2, col = "red", lty = 3)
}

#Right most protein where all proteins to the left will get removed
#start after the IgG targets
firstTarget <- length(igg.names) + 1
# Only drop the range when it is well formed: `a:b` counts downward when
# b < a, which would remove the wrong entries, and errors when P62 is missing.
if (hasP62 && P62 >= firstTarget) {
  proteinOrder <- proteinOrder[-seq(firstTarget, P62)]
}
length(proteinOrder)

#replot with fewer targets
fig <- qcProteinSignal(object = proteinData[proteinOrder,], neg.names = igg.names)
fig()
Or by specific proteins:
# Plotting order of the targets currently retained in `proteinOrder`.
proteinOrder <- qcProteinSignalNames(object = proteinData[proteinOrder,], neg.names = igg.names)

#which proteins to remove from analysis
lowTargets <- c("pan-RAS", "Neprilysin", "Olig2", "P2ry12", "p53", "NY-ESO-1", "INPP4B", "CD31", "Phospho-Alpha-synuclein (S129)", "Bcl-2")

# Filter with a logical mask: `x[-which(cond)]` returns an empty vector when
# nothing matches, which would drop every protein instead of none.
proteinOrder <- proteinOrder[!(proteinOrder %in% lowTargets)]
length(proteinOrder)

# Replot using only the retained targets.
fig <- qcProteinSignal(object = proteinData[proteinOrder,], neg.names = igg.names)
fig()
For more information on protein normalization please refer to our whitepaper.
After filtering targets, we move on to normalization. There are many types of normalization, and we have two built-in figure types to help decide which method is best for the dataset.
The first is a concordance plot of a list of targets, normally the IgGs or HK, colored by ROI factors like tissue or segment type. The upper panels are the concordance plots and the lower panels are the standard deviation of the log2-ratios between the targets. This figure does not show correlations because correlation is inflated by the large range that these values can take (198-165497 in this example). SD(log2(ratios)) measures essentially the same thing but is invariant to that range. However, the two metrics are inversely related: high correlation corresponds to low SD.
Our motivating theory is simple: if several targets all accurately measure signal strength, they should be highly correlated with each other. More precisely, the log-ratios between them should have low SDs.
Above we see good concordance amongst the IgGs, confirming they all can be used. Numbers in the top-right panels show the SD of the log2-ratios between IgGs. Importantly, we do not see a tendency for one IgG to be offset from the others, suggesting there’s no between-slide bias in calculation of background.
The second plot helps show the concordance of normalization factors. The factors are calculated on the IgG and HK targets and the area or nuclei count if provided. The lower panels are the concordance plots and the upper panels are the standard deviation of the log2-ratios between the normalization factors.
# Compute the normalization factors, passing the annotation columns that hold
# the AOI area and the nuclei count.
normfactors <- computeNormalizationFactors(
  object = proteinData,
  area = "AOI.Size.um2",
  nuclei = "Nuclei.Counts"
)

# Plot the concordance between the normalization factors, with "Tissue" as
# the plotting factor.
plotNormFactorConcordance(
  object = proteinData,
  plotFactor = "Tissue",
  normfactors = normfactors
)
From this plot we can conclude that:
This divergence of area and nuclei vs IgGs and HKs is common, which is why background or HK normalization is recommended. The area and nuclei plots are good QC metrics for spotting outliers and can also help you identify potential preferential bias in a study design.
After choosing a normalization technique from these plots, we normalize the data. Area and nuclei normalization are not native functions in GeomxTools; if you decide to normalize by those factors, you will need to do that separately. Quantile normalization is also available if HK or background normalization are not preferred.
# Each call stores its result in a separate assay element named by `toElt`.

# Housekeeper (HK) normalization -> "hk_norm"
proteinData <- normalize(
  proteinData,
  norm_method = "hk",
  toElt = "hk_norm"
)

# Background (negative control) normalization -> "neg_norm"
proteinData <- normalize(
  proteinData,
  norm_method = "neg",
  toElt = "neg_norm"
)

# Quantile normalization at the 0.75 quantile -> "q_norm"
proteinData <- normalize(
  proteinData,
  norm_method = "quant",
  desiredQuantile = .75,
  toElt = "q_norm"
)

# List the assay elements now stored on the object.
names(assayData(proteinData))
## [1] "neg_norm" "q_norm" "exprs" "hk_norm"
This dataset is now ready for downstream analysis.
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] SpatialExperiment_1.15.1 SingleCellExperiment_1.27.2
## [3] SummarizedExperiment_1.35.5 GenomicRanges_1.57.2
## [5] GenomeInfoDb_1.41.2 IRanges_2.39.2
## [7] MatrixGenerics_1.17.1 matrixStats_1.4.1
## [9] patchwork_1.3.0 SpatialDecon_1.15.0
## [11] Seurat_5.1.0 SeuratObject_5.0.2
## [13] sp_2.1-4 ggiraph_0.8.10
## [15] EnvStats_3.0.0 GeomxTools_3.11.0
## [17] NanoStringNCTools_1.13.0 ggplot2_3.5.1
## [19] S4Vectors_0.43.2 Biobase_2.67.0
## [21] BiocGenerics_0.53.0 rmarkdown_2.28
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.1 later_1.3.2
## [4] R.oo_1.26.0 tibble_3.2.1 cellranger_1.1.0
## [7] polyclip_1.10-7 fastDummies_1.7.4 lifecycle_1.0.4
## [10] globals_0.16.3 lattice_0.22-6 MASS_7.3-61
## [13] magrittr_2.0.3 plotly_4.10.4 sass_0.4.9
## [16] jquerylib_0.1.4 yaml_2.3.10 httpuv_1.6.15
## [19] sctransform_0.4.1 spam_2.11-0 spatstat.sparse_3.1-0
## [22] reticulate_1.39.0 cowplot_1.1.3 pbapply_1.7-2
## [25] buildtools_1.0.0 minqa_1.2.8 RColorBrewer_1.1-3
## [28] abind_1.4-8 zlibbioc_1.51.2 R.cache_0.16.0
## [31] Rtsne_0.17 R.utils_2.12.3 purrr_1.0.2
## [34] GenomeInfoDbData_1.2.13 ggrepel_0.9.6 irlba_2.3.5.1
## [37] listenv_0.9.1 spatstat.utils_3.1-0 maketools_1.3.1
## [40] pheatmap_1.0.12 goftest_1.2-3 RSpectra_0.16-2
## [43] spatstat.random_3.3-2 fitdistrplus_1.2-1 parallelly_1.38.0
## [46] DelayedArray_0.31.14 leiden_0.4.3.1 codetools_0.2-20
## [49] tidyselect_1.2.1 UCSC.utils_1.1.0 farver_2.1.2
## [52] lme4_1.1-35.5 spatstat.explore_3.3-3 jsonlite_1.8.9
## [55] progressr_0.15.0 ggridges_0.5.6 survival_3.7-0
## [58] systemfonts_1.1.0 tools_4.4.1 ica_1.0-3
## [61] Rcpp_1.0.13 glue_1.8.0 SparseArray_1.5.45
## [64] gridExtra_2.3 mgcv_1.9-1 xfun_0.48
## [67] ggthemes_5.1.0 dplyr_1.1.4 withr_3.0.2
## [70] numDeriv_2016.8-1.1 fastmap_1.2.0 GGally_2.2.1
## [73] repmis_0.5 boot_1.3-31 fansi_1.0.6
## [76] digest_0.6.37 R6_2.5.1 mime_0.12
## [79] colorspace_2.1-1 scattermore_1.2 tensor_1.5
## [82] spatstat.data_3.1-2 R.methodsS3_1.8.2 utf8_1.2.4
## [85] tidyr_1.3.1 generics_0.1.3 data.table_1.16.2
## [88] S4Arrays_1.5.11 httr_1.4.7 htmlwidgets_1.6.4
## [91] ggstats_0.7.0 uwot_0.2.2 pkgconfig_2.0.3
## [94] gtable_0.3.6 lmtest_0.9-40 XVector_0.45.0
## [97] sys_3.4.3 htmltools_0.5.8.1 dotCall64_1.2
## [100] scales_1.3.0 png_0.1-8 logNormReg_0.5-0
## [103] spatstat.univar_3.0-1 knitr_1.48 reshape2_1.4.4
## [106] rjson_0.2.23 uuid_1.2-1 nlme_3.1-166
## [109] nloptr_2.1.1 cachem_1.1.0 zoo_1.8-12
## [112] stringr_1.5.1 KernSmooth_2.23-24 parallel_4.4.1
## [115] miniUI_0.1.1.1 vipor_0.4.7 pillar_1.9.0
## [118] grid_4.4.1 vctrs_0.6.5 RANN_2.6.2
## [121] promises_1.3.0 xtable_1.8-4 cluster_2.1.6
## [124] beeswarm_0.4.0 evaluate_1.0.1 magick_2.8.5
## [127] cli_3.6.3 compiler_4.4.1 rlang_1.1.4
## [130] crayon_1.5.3 future.apply_1.11.3 labeling_0.4.3
## [133] plyr_1.8.9 ggbeeswarm_0.7.2 stringi_1.8.4
## [136] viridisLite_0.4.2 deldir_2.0-4 lmerTest_3.1-3
## [139] munsell_0.5.1 Biostrings_2.75.0 lazyeval_0.2.2
## [142] spatstat.geom_3.3-3 Matrix_1.7-1 RcppHNSW_0.6.0
## [145] future_1.34.0 shiny_1.9.1 highr_0.11
## [148] ROCR_1.0-11 igraph_2.1.1 bslib_0.8.0
## [151] readxl_1.4.3