Title: | Supervised mutational signatures |
---|---|
Description: | Generate SuperSigs (supervised mutational signatures) from single nucleotide variants in the cancer genome. Functions included in the package allow the user to learn supervised mutational signatures from their data and apply them to new data. The methodology is based on the one described in Afsari et al. (2021, eLife). |
Authors: | Albert Kuo [aut, cre] , Yifan Zhang [aut], Bahman Afsari [aut], Cristian Tomasetti [aut] |
Maintainer: | Albert Kuo <[email protected]> |
License: | GPL-3 |
Version: | 1.15.0 |
Built: | 2024-12-18 04:28:49 UTC |
Source: | https://github.com/bioc/supersigs |
A dataset containing a list of mutations and other necessary attributes
example_dt
example_dt
A data frame with 10 rows and 6 columns:
ID of the patient
age of the patient
chromosome of the mutation
position of the mutation
original nucleotide
mutated nucleotide
Generate a tissue-specific SuperSig for a given dataset of mutations and exposure factor. Returns the SuperSig and a classification model trained with the SuperSig.
get_signature(data, factor, wgs = FALSE)
get_signature(data, factor, wgs = FALSE)
data |
a data frame of mutations containing columns for sample_id, age, IndVar, and the 96 trinucleotide mutations (see vignette for details)
|
factor |
the factor/exposure (e.g. "age", "smoking"). If the factor = "age", the SuperSig is computed using counts. Otherwise, rates (counts/age) are used. |
wgs |
logical value indicating whether sequencing data is
whole-genome (wgs = TRUE) or whole-exome (wgs = FALSE) |
get_signature
returns an object of class SuperSig
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column get_signature(data = input_dt, factor = "Age") # get SuperSig
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column get_signature(data = input_dt, factor = "Age") # get SuperSig
Transform a data frame of mutations in long format into a data frame of trinucleotide mutations with flanking bases in a wide matrix format.
make_matrix(data, genome = "hg19")
make_matrix(data, genome = "hg19")
data |
a data frame of mutations in VCF format (see vignette for details) |
genome |
the reference genome used ("hg19" or "hg38") |
make_matrix
returns a data frame of mutations,
one row per sample
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format head(input_dt)
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format head(input_dt)
Remove the contribution of a SuperSig from the data and return the data.
partial_signature(data, object)
partial_signature(data, object)
data |
a data frame of mutations containing columns for sample_id, age, IndVar, and the 96 trinucleotide mutations (see vignette for details)
|
object |
an object of class SuperSig |
partial_signature
returns the original data frame with
the contribution of a supervised signature removed
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column supersig <- get_signature(data = input_dt, factor = "Age") # get SuperSig partial_signature(data = input_dt, object = supersig)
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column supersig <- get_signature(data = input_dt, factor = "Age") # get SuperSig partial_signature(data = input_dt, object = supersig)
Using a generated SuperSig, predict on a new dataset and return predicted probabilities for each observation.
predict_signature(object, newdata, factor)
predict_signature(object, newdata, factor)
object |
an object of class SuperSig |
newdata |
a data frame of mutations containing columns for sample_id, age, IndVar, and the 96 trinucleotide mutations (see vignette for details)
|
factor |
the factor/exposure (e.g. "age", "smoking") |
predict_signature
returns the original data frame with
additional columns for the feature counts and classification score
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column out <- get_signature(data = input_dt, factor = "Age") # get SuperSig newdata <- predict_signature(out, newdata = input_dt, factor = "age") suppressPackageStartupMessages({library(dplyr)}) head(newdata %>% select(score))
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column out <- get_signature(data = input_dt, factor = "Age") # get SuperSig newdata <- predict_signature(out, newdata = input_dt, factor = "age") suppressPackageStartupMessages({library(dplyr)}) head(newdata %>% select(score))
Transform a VCF object into a data frame of trinucleotide mutations with flanking bases in a wide matrix format. The function assumes that the VCF object contains only one sample and that each row in rowRanges represents an observed mutation in the sample.
process_vcf(vcf)
process_vcf(vcf)
vcf |
a VCF object (from the VariantAnnotation package) |
process_vcf
returns a data frame of mutations,
one row per mutation
# Use example vcf from VariantAnnotation suppressPackageStartupMessages({library(VariantAnnotation)}) fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") vcf <- VariantAnnotation::readVcf(fl, "hg19") # Subset to first sample vcf <- vcf[, 1] # Subset to row positions with homozygous or heterozygous alt positions <- geno(vcf)$GT != "0|0" vcf <- vcf[positions[, 1],] colData(vcf)$age <- 50 # Add patient age to colData (optional) # Run function dt <- process_vcf(vcf) head(dt)
# Use example vcf from VariantAnnotation suppressPackageStartupMessages({library(VariantAnnotation)}) fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") vcf <- VariantAnnotation::readVcf(fl, "hg19") # Subset to first sample vcf <- vcf[, 1] # Subset to row positions with homozygous or heterozygous alt positions <- geno(vcf)$GT != "0|0" vcf <- vcf[positions[, 1],] colData(vcf)$age <- 50 # Add patient age to colData (optional) # Run function dt <- process_vcf(vcf) head(dt)
Take a signature representation from SuperSig and group trinucleotides within each feature into interpretable labels, with optional IUPAC labeling from IUPAC_CODE_MAP in the Biostrings package
simplify_signature(object, iupac)
simplify_signature(object, iupac)
object |
an object of class SuperSig |
iupac |
logical value indicating whether to use IUPAC labels
(iupac = TRUE) or not (iupac = FALSE) |
simplify_signature
returns a vector of
simplified features and their difference in
mean rates between exposed and unexposed (or
average rate if the factor is "age")
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column supersig <- get_signature(data = input_dt, factor = "Smoking") simplify_signature(object = supersig, iupac = FALSE) simplify_signature(object = supersig, iupac = TRUE)
head(example_dt) # use example data from package input_dt <- make_matrix(example_dt) # convert to correct format input_dt$IndVar <- c(1, 1, 1, 0, 0) # add IndVar column supersig <- get_signature(data = input_dt, factor = "Smoking") simplify_signature(object = supersig, iupac = FALSE) simplify_signature(object = supersig, iupac = TRUE)
A list containing 67 SuperSigs
supersig_ls
supersig_ls
A named list with 67 elements, each of which is a 'SuperSig'
An S4 class for SuperSig
Signature
data frame of features and their difference in mean rates between exposed and unexposed (or the average rate if the factor is "age")
Features
list of features that comprise the signature and their representation in terms of the fundamental (trinucleotide) mutations
AUC
length-one numeric vector of the apparent AUC (i.e. not cross-validated)
Model
list containing the glm object for the trained logistic regression model