Title: | SubCellBarCode: Integrated workflow for robust mapping and visualizing whole human spatial proteome |
---|---|
Description: | Mass spectrometry-based spatial proteomics has enabled the proteome-wide mapping of protein subcellular localization (Orre et al. 2019, Molecular Cell). The SubCellBarCode R package robustly classifies proteins into their corresponding subcellular localizations. |
Authors: | Taner Arslan |
Maintainer: | Taner Arslan <[email protected]> |
License: | GPL-2 |
Version: | 1.23.0 |
Built: | 2024-10-31 05:36:25 UTC |
Source: | https://github.com/bioc/SubCellBarCode |
Apply thresholds for all predictions to increase the true positive rate and remove poor classification.
applyThresholdCompartment(all.repA, all.repB, threshold.df)
applyThresholdCompartment(all.repA, all.repB, threshold.df)
all.repA |
data.frame; all predictions and probability vectors for each protein in replicate A |
all.repB |
data.frame; all predictions and probability vectors for each protein in replicate B |
threshold.df |
data.frame; collection of precision and recall values for each compartment |
c.cls.df
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 550) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.c.df <- computeThresholdCompartment(test.A, test.B) all.A <- cls[[1]]$all.prot.pred all.B <- cls[[2]]$all.prot.pred c.cls.df <- applyThresholdCompartment(all.A, all.B, t.c.df) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 550) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.c.df <- computeThresholdCompartment(test.A, test.B) all.A <- cls[[1]]$all.prot.pred all.B <- cls[[2]]$all.prot.pred c.cls.df <- applyThresholdCompartment(all.A, all.B, t.c.df) }
Apply thresholds for all predictions at the neighborhood level to increase the true positive rate and remove poor classification.
applyThresholdNeighborhood(all.repA, all.repB, threshold.df)
applyThresholdNeighborhood(all.repA, all.repB, threshold.df)
all.repA |
data.frame; all predictions and probability vectors for each protein in replicate A |
all.repB |
data.frame; all predictions and probability vectors for each protein in replicate B |
threshold.df |
data.frame; collection of precision and recall values for each neighborhood |
n.cls.df
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 600) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.n.df <- computeThresholdNeighborhood(test.A, test.B) all.A <- cls[[1]]$all.prot.pred all.B <- cls[[2]]$all.prot.pred n.cls.df <- applyThresholdNeighborhood(all.A, all.B, t.n.df) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 600) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.n.df <- computeThresholdNeighborhood(test.A, test.B) all.A <- cls[[1]]$all.prot.pred all.B <- cls[[2]]$all.prot.pred n.cls.df <- applyThresholdNeighborhood(all.A, all.B, t.n.df) }
Given the proteomics data, number of overlapped marker proteins is calculated. Bar plot for each compartment is plotted.
calculateCoveredProtein(proteinIDs, markerproteins)
calculateCoveredProtein(proteinIDs, markerproteins)
proteinIDs |
character; gene symbol id |
markerproteins |
character; 3365 proteins gene symbol ids |
covered.proteins
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) }
Duplicated fractions A and B are summarized by taking their mean for each protein. After taking the mean, the data are log2 transformed. Further, the 5 main fractions are used to check the correlation between input data. It is a helper function.
calRowMean(d.df)
calRowMean(d.df)
d.df |
data.frame; A data frame of 10 fraction profiles consisting of replicate A and B. |
r.df
{ r.df <- calRowMean(SubCellBarCode::hcc827Ctrl) }
{ r.df <- calRowMean(SubCellBarCode::hcc827Ctrl) }
Identify candidate condition-dependent relocated proteins by comparing neighborhood classifications with respect to protein-protein Pearson correlation and minimum PSM (peptide-spectrum match) count.
candidateRelocatedProteins( sampleCls1, s1PSM, s1Quant, sampleCls2, s2PSM, s2Quant, annotation = FALSE, min.psm = 2, pearson.cor = 0.8 )
candidateRelocatedProteins( sampleCls1, s1PSM, s1Quant, sampleCls2, s2PSM, s2Quant, annotation = FALSE, min.psm = 2, pearson.cor = 0.8 )
sampleCls1 |
data.frame; merged classification, combination of compartment and neighborhood classification. |
s1PSM |
data.frame; minimum PSM count table across ten TMT channels |
s1Quant |
data.frame; fractionation quantification data |
sampleCls2 |
data.frame; merged classification, combination of compartment and neighborhood classification. |
s2PSM |
data.frame; minimum PSM count table across ten TMT channels |
s2Quant |
data.frame; fractionation quantification data |
annotation |
boolean; labeling the selected proteins |
min.psm |
numeric; minimum psm, peptide spectra matching value |
pearson.cor |
numeric; pearson correlation threshold |
candidate.df
{ candidate.df <- candidateRelocatedProteins(hcc827GEFClass, hcc827GefPSMCount, hcc827GEF, hcc827GEFClass, hcc827GefPSMCount, hcc827GEF, annotation = FALSE) }
{ candidate.df <- candidateRelocatedProteins(hcc827GEFClass, hcc827GefPSMCount, hcc827GEF, hcc827GEFClass, hcc827GefPSMCount, hcc827GEF, annotation = FALSE) }
Comparison of the gene centric and exon centric classification. Additionally, correlation analysis is performed using quantification data.
compareCls(geneCls, exonCls)
compareCls(geneCls, exonCls)
geneCls |
data frame gene centric classification output |
exonCls |
data frame exon centric classification output |
c.df
{ exon.cls <- data.frame(Protein = c("ENSE00000331854", "ENSE00000331855", "ENSE00000331859"), NeighborhoodCls = c("Cytosol", "Cytosol", "Cytosol"), CompartmentCls = c("C1","C1","C1"), Secretory = c(0.1, 0.1, 0.1), Nuclear = c(0.2, 0.2, 0.2), Cytosol = c(0.2, 0.2, 0.2), Mitochondria = c(0.2, 0.2, 0.2), S1 = c(0.2, 0.2, 0.2), S2 = c(0.2, 0.2, 0.2), S3 = c(0.2, 0.2, 0.2), S4 = c(0.2, 0.2, 0.2), N1 = c(0.2, 0.2, 0.2), N2 = c(0.2, 0.2, 0.2), N3 = c(0.2, 0.2, 0.2), N4 = c(0.2, 0.2, 0.2), C1 = c(0.2, 0.2, 0.2), C2 = c(0.2, 0.2, 0.2), C3 = c(0.2, 0.2, 0.2), C4 = c(0.2, 0.2, 0.2), C5 = c(0.2, 0.2, 0.2), M1 = c(0.2, 0.2, 0.2), M2 = c(0.2, 0.2, 0.2), GeneSymbol = c("COPB1", "COPB1", "COPB1"), PeptideCount = c(2, 4, 7)) gene.cls <- data.frame(Protein = c("COPB1"), NeighborhoodCls = c("Cytosol"), CompartmentCls = c("C1"), Secretory = c(0.1), Nuclear = c(0.2), Cytosol = c(0.2), Mitochondria = c(0.2), S1 = c(0.2), S2 = c(0.2), S3 = c(0.2), S4 = c(0.2), N1 = c(0.2), N2 = c(0.2), N3 = c(0.2), N4 = c(0.2), C1 = c(0.2), C2 = c(0.2), C3 = c(0.2), C4 = c(0.2), C5 = c(0.2), M1 = c(0.2), M2 = c(0.2)) comp.df <- compareCls(gene.cls, exon.cls) }
{ exon.cls <- data.frame(Protein = c("ENSE00000331854", "ENSE00000331855", "ENSE00000331859"), NeighborhoodCls = c("Cytosol", "Cytosol", "Cytosol"), CompartmentCls = c("C1","C1","C1"), Secretory = c(0.1, 0.1, 0.1), Nuclear = c(0.2, 0.2, 0.2), Cytosol = c(0.2, 0.2, 0.2), Mitochondria = c(0.2, 0.2, 0.2), S1 = c(0.2, 0.2, 0.2), S2 = c(0.2, 0.2, 0.2), S3 = c(0.2, 0.2, 0.2), S4 = c(0.2, 0.2, 0.2), N1 = c(0.2, 0.2, 0.2), N2 = c(0.2, 0.2, 0.2), N3 = c(0.2, 0.2, 0.2), N4 = c(0.2, 0.2, 0.2), C1 = c(0.2, 0.2, 0.2), C2 = c(0.2, 0.2, 0.2), C3 = c(0.2, 0.2, 0.2), C4 = c(0.2, 0.2, 0.2), C5 = c(0.2, 0.2, 0.2), M1 = c(0.2, 0.2, 0.2), M2 = c(0.2, 0.2, 0.2), GeneSymbol = c("COPB1", "COPB1", "COPB1"), PeptideCount = c(2, 4, 7)) gene.cls <- data.frame(Protein = c("COPB1"), NeighborhoodCls = c("Cytosol"), CompartmentCls = c("C1"), Secretory = c(0.1), Nuclear = c(0.2), Cytosol = c(0.2), Mitochondria = c(0.2), S1 = c(0.2), S2 = c(0.2), S3 = c(0.2), S4 = c(0.2), N1 = c(0.2), N2 = c(0.2), N3 = c(0.2), N4 = c(0.2), C1 = c(0.2), C2 = c(0.2), C3 = c(0.2), C4 = c(0.2), C5 = c(0.2), M1 = c(0.2), M2 = c(0.2)) comp.df <- compareCls(gene.cls, exon.cls) }
Thresholds for each compartment are decided to get confident predictions.
computeThresholdCompartment(test.repA, test.repB)
computeThresholdCompartment(test.repA, test.repB)
test.repA |
data.frame; test predictions, observation and probability vectors for each protein in replicate A |
test.repB |
data.frame; test predictions, observation and probability vectors for each protein in replicate B |
threshold.compartment.df
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 550) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.c.df <- computeThresholdCompartment(test.A, test.B) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 550) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.c.df <- computeThresholdCompartment(test.A, test.B) }
Thresholds for each neighborhood are decided to get confident predictions.
computeThresholdNeighborhood(test.repA, test.repB)
computeThresholdNeighborhood(test.repA, test.repB)
test.repA |
data.frame; test predictions, observation and probability vectors for each protein in replicate A |
test.repB |
data.frame; test predictions, observation and probability vectors for each protein in replicate B |
threshold.neighborhood.df
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 600) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.n.df <- computeThresholdNeighborhood(test.A, test.B) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 600) cls <- svmClassification(c.prots, df, markerProteins) test.A <- cls[[1]]$svm.test.prob.out test.B <- cls[[2]]$svm.test.prob.out t.n.df <- computeThresholdNeighborhood(test.A, test.B) }
The identifier for each feature should be converted into a gene symbol if it is not already a gene symbol.
convert2symbol(df, id = "UNIPROT")
convert2symbol(df, id = "UNIPROT")
df |
data.frame; fractionated proteomics data where data contains 10 columns of duplicated 5 fractionations and rownames must be identifier e.g. UNIPROT, Entrez ID |
id |
character; identifier id for each protein |
df
{ df <- data.frame(Uniprot = c("A4D0S4","A8TX70","O00305","O00337"), Organism = rep("Homo Sap.", 4)) rownames(df) <- df$Uniprot }
{ df <- data.frame(Uniprot = c("A4D0S4","A8TX70","O00305","O00337"), Organism = rep("Homo Sap.", 4)) rownames(df) <- df$Uniprot }
Subcellular fractionated cell line.
hcc827Ctrl
hcc827Ctrl
A data frame where 10480 protein gene-centric ids and 5 replicated subcellular fractions.
Orre et al. 2019 Cell 73, 1-17
{ head(hcc827Ctrl) }
{ head(hcc827Ctrl) }
Minimum PSM, Peptide Sequence Match, Count table for HCC827Ctrl Cell Line.
hcc827CtrlPSMCount
hcc827CtrlPSMCount
A data frame where 10480 protein gene-centric ids minimum PSM count.
Orre et al. 2019 Cell 73, 1-17
{ head(hcc827CtrlPSMCount) }
{ head(hcc827CtrlPSMCount) }
Exon-centric sub data of hcc827 fractionated data.
hcc827exon
hcc827exon
A data frame where 500 exon-centric Ensembl identifiers, corresponding gene symbols, 5 replicated subcellular fractions and number of unique peptides matched to associated exon.
Orre et al. 2019 Cell 73, 1-17
{ head(hcc827exon) }
{ head(hcc827exon) }
HCC827 cell line was treated with Gefitinib, which is an EGFR inhibitor.
hcc827GEF
hcc827GEF
A data frame where 10398 protein gene-centric ids and 5 replicated subcellular fractions with duplicates.
Orre et al. 2019 Cell 73, 1-17
{ head(hcc827GEF) }
{ head(hcc827GEF) }
Gefitinib treated HCC827 cell line classification contains both neighborhood and compartment level. The data will be used for the relocalization analysis.
hcc827GEFClass
hcc827GEFClass
A data frame where 10398 protein gene-centric ids and corresponding compartment and neighborhood classification along with classification probabilities.
Orre et al. 2019 Cell 73, 1-17
{ head(hcc827GEFClass) }
{ head(hcc827GEFClass) }
Minimum PSM, Peptide Sequence Match, Count table for HCC827 Gefitinib Cell Line.
hcc827GefPSMCount
hcc827GefPSMCount
A data frame where 10398 protein gene-centric ids minimum PSM count.
Orre et al. 2019 Cell 73, 1-17
{ head(hcc827GefPSMCount) }
{ head(hcc827GefPSMCount) }
Sampled median normalized TMT ratios are checked for any "NA" values. If any are found, the corresponding row is filtered out. Later, the data is normalized by taking log2.
loadData(protein.data)
loadData(protein.data)
protein.data |
data.frame; fractionated proteomics data where data contains 10 columns of duplicated 5 fractionations and rownames must be gene-centric protein names |
protein.data.df
{ df <- loadData(SubCellBarCode::hcc827Ctrl[1:20,]) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl[1:20,]) }
Data for the proteins whose localizations were well characterized. It also contains color codes for each compartment and median fractionation profiles for 5 fractions which are Cyto., Nsol., NucI., Horg., Lorg., with replicates A and B. These fractionation profiles will be used for the marker protein quality control.
markerProteins
markerProteins
A data frame of 3365 proteins as rows and 13 columns headers.
Orre et al. 2019 Cell 73, 1-17
Given the proteomics data, the quality of the overlapped marker proteins is evaluated by correlating replicates of fractions.
markerQualityControl(coveredProteins, protein.data)
markerQualityControl(coveredProteins, protein.data)
coveredProteins |
character; list of marker proteins, gene symbols, that are covered in 3365 marker proteins. |
protein.data |
data.frame; fractionated proteomics data, rownames are gene symbols associated protein. |
robustMarkers
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) r.markers <- markerQualityControl(c.prots[1:5], df) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) r.markers <- markerQualityControl(c.prots[1:5], df) }
Compartment and neighborhood classifications are merged for the single output.
mergeCls(compartmentCls, neighborhoodCls)
mergeCls(compartmentCls, neighborhoodCls)
compartmentCls |
data.frame; all predictions, including unclassified as well, and probability vectors for each protein in compartment classification |
neighborhoodCls |
data.frame; all predictions, including unclassified as well, and probability vectors for each protein in neighborhood classification |
cls.df
{ #create mock data com.df <- data.frame(Proteins = "TP53", svm.pred = "N1", S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(com.df) <- "TP53" neig.df <- data.frame(Proteins = "TP53", svm.pred.all = "Nuclear", Secretory = as.numeric(0.01), Nuclear = as.numeric(0.95), Cytosol = as.numeric(0.02), Mitochondria = as.numeric(0.02)) rownames(neig.df) <- "TP53" cls.df <- mergeCls(com.df, neig.df) }
{ #create mock data com.df <- data.frame(Proteins = "TP53", svm.pred = "N1", S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(com.df) <- "TP53" neig.df <- data.frame(Proteins = "TP53", svm.pred.all = "Nuclear", Secretory = as.numeric(0.01), Nuclear = as.numeric(0.95), Cytosol = as.numeric(0.02), Mitochondria = as.numeric(0.02)) rownames(neig.df) <- "TP53" cls.df <- mergeCls(com.df, neig.df) }
Compartment-level classifications are summed up to their associated neighborhood levels. It is a helper function.
mergeProbability(df)
mergeProbability(df)
df |
data.frame; all predictions at the neighborhood level and probability vectors for each protein |
merged.df
{ #create mock data df <- data.frame(Protein = "TP53", S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(df) <- "TP53" merged.df <- mergeProbability(df) }
{ #create mock data df <- data.frame(Protein = "TP53", S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(df) <- "TP53" merged.df <- mergeProbability(df) }
Stacked bar plots are plotted for the compartment and neighborhood levels with respect to classification probabilities.
plotBarcode(sampleClassification, protein, s1PSM)
plotBarcode(sampleClassification, protein, s1PSM)
sampleClassification |
data.frame; merged classification, combination of compartment and neighborhood classification. |
protein |
character; protein gene symbol name |
s1PSM |
data.frame; minimum PSM count table. Row names should be gene centric protein id. |
proteinPlot
{ #create mock data plot.df <- data.frame(Protein = "TP53", NeighborhoodCls = "Nuclear", CompartmentCls = "N1", Secretory = as.numeric(0.01), Nuclear = as.numeric(0.95), Cytosol = as.numeric(0.02), Mitochondria = as.numeric(0.02), S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(plot.df) <- "TP53" psm.df <- data.frame(Protein = "TP53", PSMs.for.quant = as.numeric(31)) rownames(psm.df) <- "TP53" proteinPlot <- plotBarcode(plot.df, "TP53", psm.df) }
{ #create mock data plot.df <- data.frame(Protein = "TP53", NeighborhoodCls = "Nuclear", CompartmentCls = "N1", Secretory = as.numeric(0.01), Nuclear = as.numeric(0.95), Cytosol = as.numeric(0.02), Mitochondria = as.numeric(0.02), S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(plot.df) <- "TP53" psm.df <- data.frame(Protein = "TP53", PSMs.for.quant = as.numeric(31)) rownames(psm.df) <- "TP53" proteinPlot <- plotBarcode(plot.df, "TP53", psm.df) }
Distributions of subcellular localizations of multiple proteins both at the compartment and neighborhood level are plotted.
plotMultipleProtein(sampleClassification, proteinList)
plotMultipleProtein(sampleClassification, proteinList)
sampleClassification |
data.frame; merged classification, combination of compartment and neighborhood classifications per protein. |
proteinList |
vector; protein gene symbol names. |
multipleProt.df
{ proteasome26s <- c("PSMA7", "PSMC3", "PSMB1", "PSMA1", "PSMA3","PSMA4", "PSMA5", "PSMB4", "PSMB6", "PSMB5","PSMC2", "PSMC4", "PSMB3", "PSMB2", "PSMD4", "PSMA6", "PSMC1", "PSMC5", "PSMC6", "PSMB7", "PSMD13") exp.cls.df <- SubCellBarCode::hcc827GEFClass multipleProt.df <- plotMultipleProtein(exp.cls.df, proteasome26s ) }
{ proteasome26s <- c("PSMA7", "PSMC3", "PSMB1", "PSMA1", "PSMA3","PSMA4", "PSMA5", "PSMB4", "PSMB6", "PSMB5","PSMC2", "PSMC4", "PSMB3", "PSMB2", "PSMD4", "PSMA6", "PSMC1", "PSMC5", "PSMC6", "PSMB7", "PSMD13") exp.cls.df <- SubCellBarCode::hcc827GEFClass multipleProt.df <- plotMultipleProtein(exp.cls.df, proteasome26s ) }
Compartment level classifications are replaced with neighborhood level assignment. It is a helper function.
replacePrediction(df, column = c("svm.pred.all", "Observation", "svm.pred"))
replacePrediction(df, column = c("svm.pred.all", "Observation", "svm.pred"))
df |
data.frame; all predictions at the compartment level and probability vectors for each protein |
column |
character; selected column in the data frame, df |
replaced.df
{ #define mock data frame df <- data.frame(svm.pred.all = c("S1","S2","S3","S4", "N1","N2","N3","N4", "C1","C2","C3","C4","C5", "M1","M2")) df$svm.pred.all <- as.character(df$svm.pred.all) df$Prob <- "1" df <- replacePrediction(df, column = "svm.pred.all") }
{ #define mock data frame df <- data.frame(svm.pred.all = c("S1","S2","S3","S4", "N1","N2","N3","N4", "C1","C2","C3","C4","C5", "M1","M2")) df$svm.pred.all <- as.character(df$svm.pred.all) df$Prob <- "1" df <- replacePrediction(df, column = "svm.pred.all") }
Identify candidate condition-dependent relocated proteins by comparing neighborhood classifications.
sankeyPlot(sampleCls1, sampleCls2)
sankeyPlot(sampleCls1, sampleCls2)
sampleCls1 |
data.frame; merged classification, combination of compartment and neighborhood classification. |
sampleCls2 |
data.frame; merged classification, combination of compartment and neighborhood classification. |
label.link.df
{ exp.cls.df <- SubCellBarCode::hcc827GEFClass sankeyData <- sankeyPlot(exp.cls.df, exp.cls.df) }
{ exp.cls.df <- SubCellBarCode::hcc827GEFClass sankeyData <- sankeyPlot(exp.cls.df, exp.cls.df) }
Compartment-level classifications on the test data are summed up to their associated neighborhood levels. It is a helper function.
sumProbability(df)
sumProbability(df)
df |
data.frame; test data classifications at the neighborhood level and probability vectors for each protein. |
summed.df
{ #create mock data df <- data.frame(Protein = "TP53", svm.pred = "N1", S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(df) <- "TP53" sum.df <- sumProbability(df) }
{ #create mock data df <- data.frame(Protein = "TP53", svm.pred = "N1", S1 = as.numeric(0.02), S2 = as.numeric(0.02), S3 = as.numeric(0.02), S4 = as.numeric(0.02), N1 = as.numeric(0.72), N2 = as.numeric(0.02), N3 = as.numeric(0.02), N4 = as.numeric(0.02), C1 = as.numeric(0.02), C2 = as.numeric(0.02), C3 = as.numeric(0.02), C4 = as.numeric(0.02), C5 = as.numeric(0.02), M1 = as.numeric(0.02), M2 = as.numeric(0.02)) rownames(df) <- "TP53" sum.df <- sumProbability(df) }
Support Vector Machine classifier is trained and used for prediction of protein subcellular localization
svmClassification(markerProteins, protein.data, markerprot.df)
svmClassification(markerProteins, protein.data, markerprot.df)
markerProteins |
character; robust marker proteins along with subcellular localization that are present in the given data. |
protein.data |
data.frame; fractionated proteomics data |
markerprot.df |
data.frame; collection of marker proteins along with corresponding subcellular localization |
all.classifications
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 500) cls <- svmClassification(c.prots, df, markerProteins) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 500) cls <- svmClassification(c.prots, df, markerProteins) }
Peptide/exon/transcript centric or PTM enriched classification is applied to predict localization of them.
svmExternalData(df, modelA, modelB)
svmExternalData(df, modelA, modelB)
df |
data frame fractionated additional data |
modelA |
model for the replicate A classification |
modelB |
model for the replicate B classification |
c.cls.df
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 550) cls <- svmClassification(c.prots, df, markerProteins) modelA <- cls[[1]]$model modelB <- cls[[2]]$model exon.cls <- svmExternalData(SubCellBarCode::hcc827exon, modelA = modelA, modelB = modelB) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(7) c.prots <- sample(c.prots, 550) cls <- svmClassification(c.prots, df, markerProteins) modelA <- cls[[1]]$model modelB <- cls[[2]]$model exon.cls <- svmExternalData(SubCellBarCode::hcc827exon, modelA = modelA, modelB = modelB) }
The marker proteins are visualized in 3D t-SNE map to see the distributions of the marker proteins.
tsneVisualization(protein.data, markerProteins, dims, theta, perplexity)
tsneVisualization(protein.data, markerProteins, dims, theta, perplexity)
protein.data |
data.frame; fractionated proteomics data |
markerProteins |
character; robust marker proteins, gene symbols, that are present in the given data and overlapped with package's marker protein list. |
dims |
integer; dimensionality |
theta |
numeric; Speed/accuracy trade-off; increase for less accuracy |
perplexity |
integer; Perplexity parameter |
tsneMap.df
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(21) tsneMap.df <- tsneVisualization(protein.data = df, markerProteins = c.prots[1:20], dims = 2, theta = c(0.4), perplexity = c(5)) }
{ df <- loadData(SubCellBarCode::hcc827Ctrl) c.prots <- calculateCoveredProtein(rownames(df), markerProteins[,1]) set.seed(21) tsneMap.df <- tsneVisualization(protein.data = df, markerProteins = c.prots[1:20], dims = 2, theta = c(0.4), perplexity = c(5)) }